emdb 0.9.9

Lightweight, high-performance embedded key-value database. Bitcask-style append-only journal, lock-free sharded hash index, at-rest encryption, sync + async APIs with streaming iterators.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
// Copyright 2026 James Gober. Licensed under Apache-2.0.

//! Storage substrate. Wraps `fsys::JournalHandle` for the write
//! path and a shared `Arc<Mmap>` for zero-copy reads on the same
//! file.
//!
//! ## File layout
//!
//! Two files live alongside each database:
//!
//! - `<path>` — fsys journal file. Bytes 0..N are owned by fsys's
//!   frame format: `[4 magic][4 length][N payload][4 crc]` per
//!   record. Lock-free LSN reservation, group-commit fsync, NVMe
//!   passthrough flush when available.
//! - `<path>.meta` — emdb's sidecar metadata (encryption salt,
//!   verify block, flags). Written via `fsys::Handle::write` for
//!   atomic-replace updates.
//!
//! ## Concurrency
//!
//! - **Writes**: `fsys::JournalHandle` does lock-free LSN
//!   reservation + concurrent `pwrite`. The hot append path holds
//!   no mutex.
//! - **Reads**: `Arc<Mmap>` over the journal file. Readers get a
//!   cheap clone of the Arc. The mapping is refreshed lazily on
//!   the read side: when a reader asks for bytes past the end of
//!   the current mapping the file is re-mapped; appends never
//!   remap. Readers holding an older Arc keep using that mapping.
//! - **Sync**: `flush()` calls `journal.sync_through(latest_lsn)`.
//!   fsys coalesces concurrent sync requests into a single
//!   `fdatasync` (or NVMe passthrough flush where supported).
//!
//! ## Crash safety
//!
//! Recovery is delegated to `fsys::JournalReader`: walks frames
//! forward, validates each CRC-32C, stops cleanly at the first
//! malformed tail. The reader's `JournalTailState` distinguishes
//! a clean shutdown from a torn write.

use std::fs::{File, OpenOptions};
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;

use crossbeam_utils::CachePadded;
use memmap2::Mmap;
use parking_lot::{Mutex, RwLock};

use crate::storage::flush::FlushPolicy;
use crate::storage::meta::{self, MetaHeader};
use crate::{Error, Result};

/// Bytes of frame header that precede each payload: a 4-byte magic
/// followed by a 4-byte length.
const FSYS_PRE_PAYLOAD_BYTES: u64 = 4 + 4;
/// Bytes of frame trailer that follow each payload: the 4-byte CRC.
const FSYS_POST_PAYLOAD_BYTES: u64 = 4;
/// Total per-record framing cost of the fsys 0.9.x v1 journal frame
/// format: header plus trailer, i.e. 12 bytes.
const FSYS_FRAME_OVERHEAD: u64 = FSYS_PRE_PAYLOAD_BYTES + FSYS_POST_PAYLOAD_BYTES;

/// Storage substrate handle. Shared by cloning the surrounding `Arc`.
///
/// Held inside an `Arc<Store>` by [`crate::storage::engine::Engine`];
/// every code path that needs to append, sync, or mmap-read goes
/// through this type.
pub(crate) struct Store {
    /// Canonical on-disk path of the journal file.
    path: PathBuf,
    /// fsys journal — the write path. Lock-free append + group-
    /// commit fsync. `Arc` because we share the handle across
    /// engine threads via `Store`'s own `Arc`.
    journal: Arc<fsys::JournalHandle>,
    /// fsys top-level handle for sidecar (meta-file) writes.
    /// Re-used across meta writes so we don't pay the
    /// builder-init cost per write.
    fs: fsys::Handle,
    /// Read-only `File` that every re-mmap maps from.
    /// Mutex-guarded so only one thread at a time maps from, or
    /// replaces, this handle.
    read_file: Mutex<File>,
    /// Current read mapping. Readers grab a snapshot via
    /// `Arc::clone`. The mapping is replaced by `refresh_mmap`
    /// (triggered from `mmap_covering` when a reader asks for
    /// bytes past the mapped length) and by `swap_underlying`;
    /// the append path never remaps.
    mmap: RwLock<Arc<Mmap>>,
    /// Byte length covered by the active mapping. Stored
    /// whenever the mapping is replaced; loaded lock-free by
    /// `mmap_covering` to decide whether a refresh is needed.
    /// `CachePadded` so this frequently-loaded atomic doesn't
    /// false-share with the surrounding fields' cache line.
    mmap_len: CachePadded<AtomicU64>,
    /// Active flush policy. Within this file only
    /// `WriteThrough` changes behaviour (an extra
    /// `sync_through` after every append / batch); `flush()`
    /// syncs through the latest LSN regardless of the policy.
    policy: FlushPolicy,
    /// Decoded meta sidecar. Mutated by
    /// `set_encryption_metadata` and written back to disk by
    /// `persist_meta`.
    meta: Arc<RwLock<MetaHeader>>,
}

// Hand-written `Debug`: prints only the path, the flush policy and
// the journal's next LSN; handles, locks and the mapping are left out.
impl std::fmt::Debug for Store {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let next_lsn = self.journal.next_lsn().as_u64();
        let mut out = f.debug_struct("Store");
        out.field("path", &self.path);
        out.field("policy", &self.policy);
        out.field("next_lsn", &next_lsn);
        out.finish()
    }
}

impl Store {
    /// Open or create the database at `path` using the default
    /// flush policy (`FlushPolicy::default()`) and no SQPOLL
    /// opt-in.
    ///
    /// `flags` are the encryption / feature bits written into the
    /// meta sidecar when the database is created; when a sidecar
    /// already exists it is used as-is and `flags` is ignored.
    pub(crate) fn open(path: PathBuf, flags: u32) -> Result<Self> {
        let policy = FlushPolicy::default();
        Self::open_with_policy(path, flags, policy, None)
    }

    /// Open or create a database with explicit flush policy.
    ///
    /// On a fresh path: writes a meta sidecar with `flags` and
    /// opens an empty journal. On an existing path: reads the
    /// meta sidecar (validates magic + version + CRC) and opens
    /// the journal in append mode.
    ///
    /// `iouring_sqpoll_idle_ms`, when `Some`, opts the journal's
    /// per-handle io_uring ring into Linux kernel-side `SQPOLL`
    /// submission polling with the given idle window. `None`
    /// uses the conservative non-SQPOLL path. Ignored on macOS /
    /// Windows.
    pub(crate) fn open_with_policy(
        path: PathBuf,
        flags: u32,
        policy: FlushPolicy,
        iouring_sqpoll_idle_ms: Option<u32>,
    ) -> Result<Self> {
        // Build a top-level fsys handle tuned for storage-engine
        // workloads (the preset fsys ships for this exact use:
        // 8 MiB resident buffer pool, 256-deep io_uring ring,
        // 4 K-deep batch queue). `Method::Auto` picks the best
        // primitive for the host (NVMe passthrough flush /
        // io_uring on Linux / `WRITE_THROUGH` on Windows).
        // If the caller opted into SQPOLL, propagate it to the
        // builder so the per-handle ring is set up with
        // `IORING_SETUP_SQPOLL` at construction time.
        let mut fs_builder = fsys::builder().tune_for(fsys::Workload::Database);
        if let Some(idle_ms) = iouring_sqpoll_idle_ms {
            fs_builder = fs_builder.sqpoll(idle_ms);
        }
        let fs = fs_builder
            .build()
            .map_err(|err| Error::Io(std::io::Error::other(format!("fsys init: {err}"))))?;

        // Resolve or create the meta sidecar. Reuse the cached
        // fsys handle so we don't pay the builder-init cost
        // (hardware probe, capability detection) again.
        let meta = match meta::read(&path)? {
            Some(existing) => existing,
            None => {
                let fresh = MetaHeader::fresh(flags);
                meta::write_with(&fs, &path, &fresh)?;
                fresh
            }
        };

        // Open the journal with a long write-lifetime hint
        // (Linux NVMe `F_SET_RW_HINT`) so the SSD groups journal
        // data into long-lived NAND blocks, reducing GC write
        // amplification. No-op on macOS / Windows. Buffered mode
        // (default) keeps the mmap-visibility invariant: once
        // `append` returns, the bytes are in the OS page cache
        // and any subsequent mmap covering that offset will see
        // them.
        let journal_opts =
            fsys::JournalOptions::new().write_lifetime_hint(Some(fsys::WriteLifetimeHint::Long));
        let journal = fs
            .journal_with(&path, journal_opts)
            .map_err(|err| Error::Io(std::io::Error::other(format!("fsys journal: {err}"))))?;
        let journal = Arc::new(journal);

        // Open a read-only File handle for the mmap path. This
        // is a separate fd from fsys's internal writer; both
        // handles point at the same inode.
        let read_file = OpenOptions::new()
            .read(true)
            .open(&path)
            .map_err(Error::Io)?;
        // SAFETY: the file backing the mmap is held alive by
        // `read_file` for the duration of the mapping. The mmap
        // covers the file's current size at map time. Concurrent
        // writes via fsys are safe — fsys's pwrite extends the
        // file, but the mmap region stays mapped to its original
        // range. We re-mmap whenever the journal grows past the
        // current mapping; readers holding old `Arc<Mmap>`
        // snapshots continue to read from the old mapping until
        // they release.
        let initial_mmap = unsafe { Mmap::map(&read_file)? };
        let mmap_len = initial_mmap.len() as u64;

        Ok(Self {
            path,
            journal,
            fs,
            read_file: Mutex::new(read_file),
            mmap: RwLock::new(Arc::new(initial_mmap)),
            mmap_len: CachePadded::new(AtomicU64::new(mmap_len)),
            policy,
            meta: Arc::new(RwLock::new(meta)),
        })
    }

    /// Path of the journal file this store was opened on.
    pub(crate) fn path(&self) -> &Path {
        self.path.as_path()
    }

    /// Copy out the in-memory meta header under the read lock.
    ///
    /// Always returns `Ok`; the `Result` wrapper is part of the
    /// signature callers rely on.
    pub(crate) fn header(&self) -> Result<MetaHeader> {
        let guard = self.meta.read();
        Ok(*guard)
    }

    /// Logical end-of-data byte offset within the journal file,
    /// taken from the journal's next LSN.
    ///
    /// NOTE(review): this is treated as the current file size on
    /// the assumption that fsys LSNs are byte offsets and the
    /// journal never pre-allocates past its data — confirm against
    /// fsys.
    pub(crate) fn tail(&self) -> u64 {
        let next = self.journal.next_lsn();
        next.as_u64()
    }

    /// Snapshot the current read mapping without refreshing it.
    ///
    /// This never re-maps: it only clones the `Arc` under the
    /// read lock, so it stays cheap for read-heavy workloads that
    /// interleave with writes. The returned mapping may therefore
    /// end before the most recently appended bytes.
    ///
    /// Callers that need a specific byte range should use
    /// [`Self::mmap_covering`], which refreshes only when the
    /// requested end offset lies past the current mapping.
    pub(crate) fn mmap(&self) -> Result<Arc<Mmap>> {
        let current = self.mmap.read();
        Ok(Arc::clone(&*current))
    }

    /// Snapshot a read mapping intended to cover bytes up to
    /// `end_offset`.
    ///
    /// If the recorded mapping length is already at least
    /// `end_offset`, the current mapping is returned as-is.
    /// Otherwise the mapping is refreshed once from the file and
    /// the refreshed mapping is returned. The length is not
    /// re-checked after the refresh, so callers must still
    /// bounds-check before slicing.
    ///
    /// Pass `offset + 1` (or `offset + record_size`) for the
    /// bytes you intend to read.
    pub(crate) fn mmap_covering(&self, end_offset: u64) -> Result<Arc<Mmap>> {
        let mapped_len = self.mmap_len.load(Ordering::Acquire);
        let needs_refresh = mapped_len < end_offset;
        if needs_refresh {
            self.refresh_mmap()?;
        }
        let current = self.mmap.read();
        Ok(Arc::clone(&*current))
    }

    /// Append one payload to the journal and return the byte
    /// offset of the payload's first byte within the journal
    /// file — the value the engine stores in its in-memory index.
    ///
    /// The payload is wrapped in an fsys frame (magic + length
    /// before it, CRC after it); the returned offset points past
    /// the frame header, directly at the payload.
    ///
    /// The read mapping is not touched here. To read the new
    /// bytes through the mmap, use [`Self::mmap_covering`] with
    /// an end offset past the payload; the plain [`Self::mmap`]
    /// accessor does not refresh.
    ///
    /// Under [`FlushPolicy::WriteThrough`] the call also issues
    /// `sync_through` for the new tail before returning.
    ///
    /// # Errors
    ///
    /// Returns `Error::Io` if the fsys append or the
    /// write-through sync fails.
    pub(crate) fn append(&self, payload: &[u8]) -> Result<u64> {
        let frame_end = self
            .journal
            .append(payload)
            .map_err(|err| Error::Io(std::io::Error::other(format!("fsys append: {err}"))))?
            .as_u64();

        // The returned LSN marks the end of the frame, so step
        // back over the trailing CRC and the payload itself to
        // find where the payload begins.
        let trailer_and_payload = FSYS_POST_PAYLOAD_BYTES + payload.len() as u64;
        let payload_start = frame_end - trailer_and_payload;

        // WriteThrough: make the bytes durable before returning.
        if let FlushPolicy::WriteThrough = self.policy {
            let target = fsys::Lsn::new(frame_end);
            self.journal
                .sync_through(target)
                .map_err(|err| Error::Io(std::io::Error::other(format!("fsys sync: {err}"))))?;
        }

        Ok(payload_start)
    }

    /// Closure-style single append.
    ///
    /// Allocates a scratch `Vec<u8>` (initial capacity 64), lets
    /// `fill_payload` encode the record into it, then forwards
    /// the bytes to [`Self::append`]. If the closure returns an
    /// error nothing is appended and the error is propagated.
    pub(crate) fn append_with<F>(&self, fill_payload: F) -> Result<u64>
    where
        F: FnOnce(&mut Vec<u8>) -> Result<()>,
    {
        let mut scratch: Vec<u8> = Vec::with_capacity(64);
        fill_payload(&mut scratch)?;
        self.append(scratch.as_slice())
    }

    /// Closure-style batch append.
    ///
    /// `fill` receives an empty `Vec<Vec<u8>>` and pushes one
    /// entry per record. After it returns, the entries are handed
    /// to [`Self::append_batch`] in order and the per-record
    /// payload-start offsets come back in the same order. If
    /// `fill` returns an error nothing is appended.
    pub(crate) fn append_batch_with<F>(&self, fill: F) -> Result<Vec<u64>>
    where
        F: FnOnce(&mut Vec<Vec<u8>>) -> Result<()>,
    {
        let mut payloads: Vec<Vec<u8>> = Vec::new();
        fill(&mut payloads)?;
        // `append_batch` collects its input into a `Vec<&[u8]>`
        // itself, so pass the borrowing iterator straight through
        // instead of building a second, throwaway vector here.
        self.append_batch(payloads.iter().map(Vec::as_slice))
    }

    /// Append several payloads through one call to
    /// `fsys::JournalHandle::append_batch` and return each
    /// payload's start offset, in input order.
    ///
    /// An empty input returns an empty vector without touching
    /// the journal.
    ///
    /// The journal reports only the end-of-batch LSN. Start
    /// offsets are reconstructed from it: frames sit back to
    /// back, each one occupies `FSYS_FRAME_OVERHEAD +
    /// payload.len()` bytes, and each payload begins
    /// `FSYS_PRE_PAYLOAD_BYTES` into its frame.
    ///
    /// The read mapping is not refreshed here. Under
    /// [`FlushPolicy::WriteThrough`] the batch is synced through
    /// its end LSN before returning.
    ///
    /// # Errors
    ///
    /// Returns `Error::Io` if the fsys batch append or the
    /// write-through sync fails.
    pub(crate) fn append_batch<'a, I>(&self, payloads: I) -> Result<Vec<u64>>
    where
        I: IntoIterator<Item = &'a [u8]>,
    {
        let frames: Vec<&[u8]> = payloads.into_iter().collect();
        if frames.is_empty() {
            return Ok(Vec::new());
        }

        let batch_end = self
            .journal
            .append_batch(&frames)
            .map_err(|err| Error::Io(std::io::Error::other(format!("fsys append_batch: {err}"))))?
            .as_u64();

        // Total bytes the batch occupies on disk, framing included;
        // subtracting it from the end LSN gives the first frame's
        // start offset.
        let batch_len: u64 = frames
            .iter()
            .map(|frame| FSYS_FRAME_OVERHEAD + frame.len() as u64)
            .sum();
        let batch_start = batch_end - batch_len;

        // Walk the frames front to back, carrying the running
        // frame-start offset and emitting each payload's offset.
        let mut starts = Vec::with_capacity(frames.len());
        starts.extend(frames.iter().scan(batch_start, |frame_start, frame| {
            let payload_start = *frame_start + FSYS_PRE_PAYLOAD_BYTES;
            *frame_start += FSYS_FRAME_OVERHEAD + frame.len() as u64;
            Some(payload_start)
        }));

        // WriteThrough: make the whole batch durable before
        // returning.
        if let FlushPolicy::WriteThrough = self.policy {
            let target = fsys::Lsn::new(batch_end);
            self.journal
                .sync_through(target)
                .map_err(|err| Error::Io(std::io::Error::other(format!("fsys sync: {err}"))))?;
        }

        Ok(starts)
    }

    /// Force pending writes durable to stable storage.
    ///
    /// Calls `fsys::JournalHandle::sync_through(next_lsn)` —
    /// fsys coalesces concurrent sync requests internally, so
    /// callers under N threads that all call `flush()` at once
    /// see exactly one `fdatasync` (or NVMe passthrough flush)
    /// covering everyone's writes.
    ///
    /// Under [`FlushPolicy::WriteThrough`], appends are already
    /// durable on return, so `flush()` is a near-free
    /// "sync the empty tail" call.
    pub(crate) fn flush(&self) -> Result<()> {
        let target = self.journal.next_lsn();
        self.journal
            .sync_through(target)
            .map_err(|err| Error::Io(std::io::Error::other(format!("fsys sync: {err}"))))?;
        Ok(())
    }

    /// Persist the meta sidecar (headers + flags + encryption
    /// metadata). Used on graceful drop and on encryption
    /// metadata changes.
    pub(crate) fn persist_meta(&self) -> Result<()> {
        let header = *self.meta.read();
        meta::write_with(&self.fs, &self.path, &header)?;
        Ok(())
    }

    /// Install new encryption metadata (salt + verification
    /// block), mark the header as encrypted, and persist the
    /// sidecar.
    ///
    /// The in-memory header is updated under the write lock; the
    /// lock is released before `persist_meta` runs, because that
    /// call takes the read lock on the same header.
    pub(crate) fn set_encryption_metadata(
        &self,
        salt: [u8; meta::META_SALT_LEN],
        verify: [u8; meta::META_VERIFY_LEN],
    ) -> Result<()> {
        let mut header = self.meta.write();
        header.encryption_salt = salt;
        header.encryption_verify = verify;
        header.flags |= meta::FLAG_ENCRYPTED;
        // Release the write lock before persisting.
        drop(header);

        self.persist_meta()
    }

    /// Atomically replace the journal file with `replacement_path`.
    /// Used by compaction.
    ///
    /// Sequence:
    /// 1. Drop our own read-mmap and read-file handles so
    ///    Windows allows the rename.
    /// 2. Close the existing journal (final sync + drop).
    /// 3. Atomic-rename the replacement file over the canonical
    ///    path.
    /// 4. Reopen the journal on the new path; refresh the
    ///    read-mmap.
    ///
    /// Old `Arc<Mmap>` snapshots held by readers stay valid
    /// through the swap on every supported OS — Linux/macOS keep
    /// the original inode alive while a mapping references it,
    /// and Windows holds the file via the mapping handle.
    pub(crate) fn swap_underlying(self: &Arc<Self>, replacement_path: &Path) -> Result<()> {
        // Close the journal before renaming. We need exclusive
        // ownership of the journal handle, which means no other
        // Arc<Self> clones can be in flight. Caller (Engine) is
        // responsible for ensuring single-ownership at swap time.
        //
        // We can't actually close the journal here because we
        // hold an `Arc<JournalHandle>`. Instead, drop our own
        // strong reference to the journal by replacing it with
        // a journal pointing at the new file.

        // Lock the mmap so no concurrent readers grab the old
        // Arc while we swap.
        let mut mmap_guard = self.mmap.write();
        let mut file_guard = self.read_file.lock();

        // Drop the old read-file by replacing it with a placeholder
        // pointing at the replacement. Windows requires the file
        // to not be held when we rename over it.
        let placeholder = OpenOptions::new()
            .read(true)
            .open(replacement_path)
            .map_err(Error::Io)?;
        let _old_read_file = std::mem::replace(&mut *file_guard, placeholder);
        drop(_old_read_file);

        // Atomic rename: replacement → original path.
        std::fs::rename(replacement_path, &self.path).map_err(Error::Io)?;

        // Reopen our read-file on the new path.
        let new_read_file = OpenOptions::new()
            .read(true)
            .open(&self.path)
            .map_err(Error::Io)?;
        *file_guard = new_read_file;

        // Re-mmap from the new file.
        // SAFETY: same invariants as the initial mmap in
        // `open_with_policy` — file held alive by `file_guard`,
        // mapping covers the whole file at map time.
        let new_mmap = unsafe { Mmap::map(&*file_guard)? };
        let new_len = new_mmap.len() as u64;
        *mmap_guard = Arc::new(new_mmap);
        self.mmap_len.store(new_len, Ordering::Release);

        Ok(())
    }

    /// Re-map the read file at its current size and publish the
    /// new mapping and its length. Called by `mmap_covering` when
    /// a reader asks for bytes past the current mapping.
    ///
    /// The mmap write lock is held for the whole operation and is
    /// taken *before* the read-file mutex. Holding it across the
    /// map-and-publish step means a mapping created earlier can
    /// never be installed over one created later, whether by a
    /// concurrent refresh or by `swap_underlying`. Taking the
    /// locks in the same order as `swap_underlying` keeps the two
    /// paths deadlock-free.
    ///
    /// # Errors
    ///
    /// Returns an error if the mapping cannot be created; the
    /// previously published mapping is left in place.
    fn refresh_mmap(&self) -> Result<()> {
        let mut mmap_guard = self.mmap.write();
        let file_guard = self.read_file.lock();
        // SAFETY: the file handle lives in `self.read_file` and
        // is kept open behind `file_guard`; the mapping covers the
        // file's size at map time. Appends through the journal
        // only extend the file past the mapped range.
        let new_mmap = unsafe { Mmap::map(&*file_guard)? };
        let new_len = new_mmap.len() as u64;
        *mmap_guard = Arc::new(new_mmap);
        self.mmap_len.store(new_len, Ordering::Release);
        Ok(())
    }

    /// Run a fresh `fsys::JournalReader` over the on-disk journal.
    /// Used by [`crate::storage::engine::Engine::recovery_scan`]
    /// to walk records and populate the in-memory index.
    pub(crate) fn open_reader(&self) -> Result<fsys::JournalReader> {
        fsys::JournalReader::open(&self.path)
            .map_err(|err| Error::Io(std::io::Error::other(format!("fsys reader: {err}"))))
    }

    /// Borrow the store's top-level fsys handle — the same handle
    /// `persist_meta` uses for sidecar writes. Exposed so the
    /// engine's encryption-admin path can perform its own
    /// atomic-replace meta sidecar writes.
    pub(crate) fn fs(&self) -> &fsys::Handle {
        &self.fs
    }

    /// Total framing bytes fsys adds around each payload
    /// (`FSYS_FRAME_OVERHEAD`). For callers that need to reason
    /// about fsys frame geometry, e.g. computing the byte range
    /// a record occupies on disk given its `payload_start`.
    pub(crate) const fn frame_overhead() -> u64 {
        FSYS_FRAME_OVERHEAD
    }
    /// Number of frame-header bytes that precede each payload
    /// (`FSYS_PRE_PAYLOAD_BYTES`); subtract it from a payload's
    /// start offset to get the start of its frame.
    pub(crate) const fn pre_payload_bytes() -> u64 {
        FSYS_PRE_PAYLOAD_BYTES
    }
}

impl Drop for Store {
    fn drop(&mut self) {
        // Write the meta sidecar one last time in case the
        // in-memory header changed while this handle was alive.
        // This is best-effort: `Drop` has no way to report a
        // failure, so the result is deliberately discarded.
        drop(self.persist_meta());
    }
}