fsys 1.1.0 - Docs.rs

//! Write-ahead-log / journal substrate (0.8.0).
//!
//! A journal is an append-only log file with explicit
//! group-commit durability semantics. Unlike `Handle::write`'s
//! atomic-replace pattern (5–7 syscalls per write, fsync per
//! call), a journal is opened *once*, appends are issued without
//! per-call fsync, and durability is established explicitly via
//! [`JournalHandle::sync_through`]. This is the primitive every
//! serious database storage engine uses for the WAL — it gets
//! you to millions of durable writes/sec.
//!
//! ## When to use the journal
//!
//! Use the journal when:
//!
//! - You're building a database, queue, or ledger that needs
//!   high-throughput durable writes with **commit-LSN
//!   semantics** (durability is a sequence point, not a
//!   per-write flush).
//! - You need **group-commit batching** — N appends amortised
//!   across one fsync.
//! - You've already opened the target file conceptually (you're
//!   appending to a single log, not replacing N files).
//!
//! Use [`Handle::write`](crate::Handle::write) instead when:
//!
//! - You need atomic-replace semantics (the file is either
//!   entirely the old payload or entirely the new payload at
//!   every observable point).
//! - You're updating individual files (config files, package
//!   manifests, document state).
//! - Throughput is not the dominant concern; correctness +
//!   atomicity are.
//!
//! ## Concurrency
//!
//! [`JournalHandle`] is `Send + Sync` and can be shared across
//! threads via [`std::sync::Arc`]. In buffered mode (the
//! default), concurrent appends from multiple threads serialise
//! through an atomic LSN reservation (no mutex on the append
//! path); the underlying `pwrite` calls are concurrent-safe per
//! POSIX. Direct-IO journals instead route appends through a
//! shared, internally locked log buffer. Concurrent calls to
//! [`sync_through`](JournalHandle::sync_through) are
//! group-committed: only one `fsync` syscall runs at a time;
//! all callers waiting for an LSN ≤ the synced frontier wake
//! immediately when the in-flight fsync completes.
//!
//! ## LSN model
//!
//! [`Lsn`] is the byte-offset of the *next* write position
//! after a record's on-disk frame. Every record is stored as a
//! frame — the payload plus 12 bytes of framing overhead — so
//! if you append an 88-byte payload (a 100-byte frame) starting
//! at offset 1000, [`append`](JournalHandle::append) returns
//! `Lsn(1100)`. Calling
//! [`sync_through(Lsn(1100))`](JournalHandle::sync_through)
//! ensures every byte from offset 0 through 1099 is on stable
//! storage.
//!
//! LSNs are monotonic per-handle. They reset to `Lsn(0)` only
//! when the underlying file is truncated or recreated.

pub mod backend;
pub(crate) mod format;
pub(crate) mod log_buffer;
pub mod options;
pub mod reader;

pub use backend::{JournalBackend, JournalBackendHealth, JournalBackendInfo, JournalBackendKind};
pub use options::{JournalOptions, SyncMode, WriteLifetimeHint};
pub use reader::{JournalIter, JournalReader, JournalRecord, JournalTailState};

use crate::{Error, Result};
use crossbeam_utils::CachePadded;
use log_buffer::LogBuffer;
use parking_lot::{Condvar, Mutex as PlMutex};
use std::fs::{File, OpenOptions};
use std::io::Seek;
use std::path::Path;
use std::sync::atomic::{AtomicU64, Ordering};
use std::time::{Duration, Instant};

/// Threshold for the stack-allocated frame fast path on the
/// buffered-mode append hot path. Frames whose total size
/// (payload + 12-byte overhead) is at most this value — i.e.
/// payloads of up to 2036 bytes — are encoded into a stack
/// array; larger frames fall back to a heap allocation. 2 KiB
/// covers virtually every real-world WAL record (typical sizes
/// are 64 B – 1 KiB) while keeping the per-call stack footprint
/// bounded.
///
/// The exact value is internal — choosing a different size is
/// purely a performance-tuning decision and never affects the
/// bytes written. Records larger than this still encode and
/// append correctly via the heap fallback.
const STACK_FRAME_THRESHOLD: usize = 2048;

/// Log sequence number: the byte offset, within the journal's
/// underlying file, at which the next write will land.
///
/// [`JournalHandle::append`] hands these out and
/// [`JournalHandle::sync_through`] takes them back. Values are
/// monotonic per-handle and order transparently
/// (`Lsn(100) < Lsn(200)` ⟺ the first record was appended before
/// the second).
///
/// `Lsn::ZERO` is the start-of-journal sentinel, meaning
/// "nothing has been appended yet."
///
/// # Construction
///
/// The wrapped offset is **private** so the monotonic invariant
/// cannot be broken by external mutation (`lsn.0 = ...` is
/// intentionally impossible; only the journal mints fresh LSNs).
/// Use [`Lsn::new`] or [`From<u64>`] to rebuild an LSN from a
/// persisted raw `u64` before forwarding it to
/// [`JournalHandle::sync_through`]:
///
/// ```
/// use fsys::Lsn;
/// let lsn = Lsn::new(1100);
/// let same: Lsn = 1100u64.into();
/// assert_eq!(lsn, same);
/// ```
#[derive(Copy, Clone, Debug, Default, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Lsn(u64);

impl Lsn {
    /// Start-of-journal sentinel; the same value as `Lsn::new(0)`
    /// and `Lsn::default()`.
    pub const ZERO: Lsn = Lsn::new(0);

    /// Wraps a raw byte offset in an `Lsn`.
    ///
    /// Normally LSNs originate from [`JournalHandle::append`] and
    /// flow straight back into [`JournalHandle::sync_through`].
    /// This constructor covers the round-trip case: an LSN that
    /// was persisted elsewhere (e.g., in a downstream index) and
    /// now has to be rebuilt from its numeric form.
    #[must_use]
    #[inline]
    pub const fn new(offset: u64) -> Self {
        Lsn(offset)
    }

    /// Unwraps the LSN into its raw byte offset.
    #[must_use]
    #[inline]
    pub const fn as_u64(self) -> u64 {
        let Lsn(offset) = self;
        offset
    }
}

impl From<u64> for Lsn {
    /// Same as [`Lsn::new`].
    #[inline]
    fn from(offset: u64) -> Self {
        Lsn::new(offset)
    }
}

impl From<Lsn> for u64 {
    /// Same as [`Lsn::as_u64`].
    #[inline]
    fn from(lsn: Lsn) -> Self {
        lsn.as_u64()
    }
}

impl std::fmt::Display for Lsn {
    /// Renders the LSN as `Lsn(<offset>)`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let offset = self.as_u64();
        write!(f, "Lsn({offset})")
    }
}

/// Append-only log file with explicit group-commit durability.
///
/// Open via [`Handle::journal`](crate::Handle::journal). Share
/// across threads via [`Arc`](std::sync::Arc).
///
/// # Example
///
/// ```no_run
/// use std::sync::Arc;
/// use fsys::builder;
///
/// # fn main() -> fsys::Result<()> {
/// let fs = builder().build()?;
/// let log = Arc::new(fs.journal("/var/log/app.wal")?);
///
/// // Append several records — no fsync, no syscall amplification.
/// let _lsn1 = log.append(b"record 1")?;
/// let _lsn2 = log.append(b"record 2")?;
/// let lsn3 = log.append(b"record 3")?;
///
/// // Group-commit fsync — one syscall covers all three appends.
/// log.sync_through(lsn3)?;
/// # Ok(())
/// # }
/// ```
pub struct JournalHandle {
    /// The underlying file — held *unwrapped*, no `Mutex` on the
    /// append hot path. `std::fs::File` is `Send + Sync`, and
    /// platform `pwrite` (Linux/macOS) / `WriteFile` with offset
    /// (Windows) is concurrent-safe per call when each call writes
    /// to a distinct offset (which is the LSN-reservation
    /// invariant). 0.8.0 R-1 tier-2: removed the `Mutex<File>`
    /// from the append path; concurrent appends from N threads
    /// no longer serialise through a lock.
    pub(crate) file: File,
    /// Highest reserved LSN. In buffered mode it is atomically
    /// advanced by [`Self::append`] via `fetch_add(frame_len)`
    /// (payload plus frame overhead); in direct mode it is
    /// published from the end offset returned by the log buffer.
    /// Never decreases.
    ///
    /// 0.9.1: cache-padded ([`CachePadded`]) so the appender
    /// hot path's `fetch_add` doesn't ping-pong with
    /// [`Self::synced_lsn`]'s reads from group-commit followers.
    /// On a 64-byte cache line, the previous `(AtomicU64,
    /// AtomicU64)` pair shared a single line and produced a
    /// MESI invalidate every time a sync completed during high
    /// append load. Padding both members eliminates that
    /// false-sharing class entirely.
    pub(crate) next_lsn: CachePadded<AtomicU64>,
    /// Highest LSN known durable on stable storage. Updated by
    /// [`Self::sync_through`] after a successful fsync; never decreases.
    /// See `next_lsn` for the cache-padding rationale.
    pub(crate) synced_lsn: CachePadded<AtomicU64>,
    /// 0.9.1 — leader/follower group-commit coordinator. Replaces
    /// the pre-0.9.1 `sync_gate: Mutex<()>` blocking gate. The
    /// first thread to call [`Self::sync_through`] becomes the
    /// **leader**; concurrent callers become **followers** that
    /// park on a [`parking_lot::Condvar`] and wake when the
    /// leader's fsync completes. The leader optionally waits
    /// [`JournalOptions::group_commit_window`] for additional
    /// followers to enqueue, exiting early once
    /// [`JournalOptions::group_commit_max_batch`] have joined.
    /// Ports the v0.8.5 emdb scheme that achieved 8× aggregate
    /// throughput vs unbatched per-flush mode.
    pub(crate) group_commit: GroupCommit,
    /// Path the journal was opened at — for diagnostics / errors.
    /// Not used on the hot path.
    #[allow(dead_code)]
    path: std::path::PathBuf,
    /// Lazy native io_uring substrate — Linux + `async` feature
    /// only. Constructed on first `append_async` /
    /// `sync_through_async` call inside a tokio runtime context.
    /// After a construction failure (e.g. io_uring unavailable)
    /// the cell is initialised to `None` — i.e. `get()` returns
    /// `Some(None)` — so subsequent async calls fall back to
    /// `spawn_blocking` without retrying. Tier-3 (R-1 follow-up):
    /// when populated, async appends submit `IORING_OP_WRITE`
    /// SQEs and `sync_through_async` submits
    /// `IORING_OP_FSYNC(DATASYNC)` SQEs through the same ring,
    /// eliminating the `spawn_blocking` thread-pool hop.
    #[cfg(all(target_os = "linux", feature = "async"))]
    pub(crate) native_ring: std::sync::OnceLock<
        Option<std::sync::Arc<crate::async_io::completion_driver::AsyncIoUring>>,
    >,
    /// Direct-IO mode flag. `true` only when the journal was
    /// opened with [`JournalOptions::direct`] set **and** the
    /// platform accepted the Direct-IO flag (`open_direct` stores
    /// its `direct_active` result here; `open_buffered` always
    /// stores `false`). Reported by [`Self::is_direct_active`];
    /// the append path itself branches on whether
    /// [`Self::log_buffer`] is populated.
    pub(crate) direct: bool,
    /// In-memory sector-aligned log buffer. `Some(_)` exclusively
    /// when `direct = true`; `None` otherwise. Direct-mode appends
    /// serialise into this single shared buffer (the InnoDB /
    /// WiredTiger pattern); since 0.9.5 the `LogBuffer` does its
    /// own locking internally, so no outer `Mutex` wraps the
    /// field. Buffered-mode journals retain their lock-free fast
    /// path.
    ///
    /// 0.9.7 H-2 — private (not `pub(crate)`): only accessed
    /// from within `src/journal/mod.rs`. Demoted so future
    /// refactors can change the field shape without breaking
    /// callers in other modules.
    log_buffer: Option<LogBuffer>,
    /// 0.9.2 — optional structured-telemetry observer cloned in
    /// from the parent [`crate::Handle`] at journal-open time.
    /// `None` for journals on observer-less handles. Per-op cost
    /// when `None`: a single `Option::is_some` branch.
    ///
    /// 0.9.7 H-2 — private. Set via [`Self::set_observer`].
    observer: Option<std::sync::Arc<dyn crate::observer::FsysObserver>>,
    /// 0.9.4 — durability primitive choice for `sync_through`.
    /// `SyncMode::Full` (default) calls
    /// `file.sync_data()` (the platform's full media-durability
    /// primitive); `SyncMode::Barrier` calls
    /// `platform::sync_barrier()` (cheaper on macOS with PLP).
    /// Captured at journal-open time from
    /// `JournalOptions::sync_mode`.
    ///
    /// 0.9.7 H-2 — private (not `pub(crate)`): only consulted
    /// inside `do_sync_locked` / `sync_through` in this module.
    sync_mode: options::SyncMode,
}

impl JournalHandle {
    /// Opens the journal at `path` for append with the default
    /// [`JournalOptions`].
    ///
    /// Convenience wrapper: builds `JournalOptions::default()` and
    /// forwards to [`Self::open_with_options`].
    ///
    /// `pub(crate)` because the public entry point is
    /// [`Handle::journal`], which lives on [`Handle`] for
    /// path-root resolution.
    pub(crate) fn open(path: &Path) -> Result<Self> {
        let defaults = JournalOptions::default();
        Self::open_with_options(path, defaults)
    }

    /// Opens the journal at `path`, dispatching on
    /// `options.direct` to the matching constructor.
    ///
    /// **Buffered mode** (`options.direct == false`, the default):
    /// handled by `open_buffered`. The file is opened via standard
    /// `OpenOptions`, the lock-free LSN reservation + concurrent
    /// `pwrite` path is active, and resume sets `next_lsn` to the
    /// existing file size.
    ///
    /// **Direct mode** (`options.direct == true`): handled by
    /// `open_direct`. The file is opened with the platform's
    /// Direct-IO flag (`O_DIRECT` / `F_NOCACHE` /
    /// `FILE_FLAG_NO_BUFFERING`) and an in-memory sector-aligned
    /// log buffer is allocated; appends serialise into the buffer
    /// and flush in sector-aligned chunks. Resume scans the
    /// existing file for the LSN immediately past the last
    /// cleanly-decoded frame and continues from there — partial
    /// trailing sector content is rehydrated into the buffer so
    /// subsequent flushes overwrite the zero-pad cleanly.
    pub(crate) fn open_with_options(path: &Path, options: JournalOptions) -> Result<Self> {
        if !options.direct {
            return Self::open_buffered(path, options);
        }
        Self::open_direct(path, options)
    }

    /// Buffered-mode constructor. Lock-free append, lock-free
    /// LSN reservation, group-commit fsync. This is the default
    /// path when [`JournalOptions::direct`] is not set.
    ///
    /// 0.9.1: now accepts `JournalOptions` so it can honour the
    /// `group_commit_window` and `group_commit_max_batch` knobs.
    /// Pre-0.9.1 buffered journals always used `Mutex<()>` with
    /// no batching window; the new default (`Some(500 µs) / 8`)
    /// matches emdb v0.8.5's coordinator, which empirically
    /// produced an 8× aggregate-throughput win on 8-thread
    /// per-record-flush workloads.
    fn open_buffered(path: &Path, options: JournalOptions) -> Result<Self> {
        let mut file = OpenOptions::new()
            .read(true)
            .write(true)
            .create(true)
            .truncate(false)
            .open(path)
            .map_err(Error::Io)?;

        // 0.9.4 — apply the optional NVMe write-lifetime hint
        // on Linux. Failure is non-fatal (older kernels, drives
        // without multi-stream, filesystems that reject the
        // fcntl) — the hint is advisory.
        Self::apply_write_lifetime_hint(&file, options.write_lifetime_hint);

        // Resume: next_lsn = current file length. Seek to end so
        // that any sneaky `write()` (which we don't use, but
        // belt-and-braces) lands at the right place.
        let len = file.seek(std::io::SeekFrom::End(0)).map_err(Error::Io)?;

        Ok(Self {
            file,
            next_lsn: CachePadded::new(AtomicU64::new(len)),
            synced_lsn: CachePadded::new(AtomicU64::new(len)),
            group_commit: GroupCommit::new(
                options.group_commit_window,
                options.group_commit_max_batch,
                len,
            ),
            path: path.to_path_buf(),
            #[cfg(all(target_os = "linux", feature = "async"))]
            native_ring: std::sync::OnceLock::new(),
            direct: false,
            log_buffer: None,
            observer: None,
            sync_mode: options.sync_mode,
        })
    }

    /// 0.9.4 — Applies the optional NVMe write-lifetime hint to
    /// the journal file.
    ///
    /// Does nothing when `hint` is `None`. Otherwise the hint is
    /// mapped to a small ordinal (`Short` = 0, `Medium` = 1,
    /// `Long` = 2, `Extreme` = 3) and handed to
    /// [`crate::platform::set_write_lifetime_hint`], which is a
    /// real `fcntl(F_SET_RW_HINT)` on Linux and a no-op on every
    /// other platform.
    ///
    /// Failure to set the hint (older kernel, FS rejection, drive
    /// without multi-stream support) is silently ignored — the
    /// hint is advisory; missing it costs at most some NAND
    /// garbage-collection efficiency, never correctness.
    fn apply_write_lifetime_hint(file: &File, hint: Option<options::WriteLifetimeHint>) {
        let Some(lifetime) = hint else {
            return;
        };
        let ordinal: u8 = match lifetime {
            options::WriteLifetimeHint::Short => 0,
            options::WriteLifetimeHint::Medium => 1,
            options::WriteLifetimeHint::Long => 2,
            options::WriteLifetimeHint::Extreme => 3,
        };
        // Best-effort: the result is discarded on purpose.
        let _ = crate::platform::set_write_lifetime_hint(file, ordinal);
    }

    /// Direct-mode constructor. Opens the file with the platform's
    /// `O_DIRECT` analog and allocates the sector-aligned log
    /// buffer. On reopen, scans the existing file for the
    /// last-clean LSN and rehydrates the partial trailing sector
    /// into the buffer.
    ///
    /// If the filesystem rejects the Direct-IO flag the handle is
    /// still returned, but with no log buffer and `direct` set to
    /// `false`, so appends take the buffered path. Callers can
    /// detect the downgrade via [`Self::is_direct_active`].
    fn open_direct(path: &Path, options: JournalOptions) -> Result<Self> {
        // Resume cursor: end LSN of the last cleanly-decoded frame
        // in an existing file, or 0 for a journal that does not
        // exist yet.
        let resume_at = if path.exists() { scan_clean_end(path)? } else { 0 };

        let sector_size = crate::platform::probe_sector_size(path);

        // `open_direct_journal` yields `(file, direct_active)`.
        // `direct_active == false` means the filesystem rejected
        // the Direct-IO flag and we got a plain buffered handle
        // back — still functional, and observable through
        // `is_direct_active`.
        let (file, direct_active) = open_direct_journal(path, sector_size)?;

        // 0.9.4 — optional NVMe write-lifetime hint; non-fatal on
        // failure, exactly as in the buffered constructor.
        Self::apply_write_lifetime_hint(&file, options.write_lifetime_hint);

        // Only a genuinely Direct-IO handle gets a log buffer.
        // Without one the append path falls through to the
        // buffered route; the "direct" intent is not lost
        // silently because `direct` records the outcome.
        let log_buffer = if direct_active {
            // 0.9.5: `log_buffer_kib` is PER SLOT (each of the two
            // buffer slots in the dual-buffer implementation), so
            // total memory is 2 × `slot_bytes`.
            let slot_bytes = options.log_buffer_kib.saturating_mul(1024);
            let buffer = LogBuffer::new(slot_bytes, sector_size, 0)?;
            // On resume, prime the buffer with the partial
            // trailing sector so that subsequent flushes overwrite
            // the zero-pad cleanly.
            if resume_at != 0 {
                rehydrate_log_buffer(&buffer, &file, sector_size, resume_at)?;
            }
            Some(buffer)
        } else {
            None
        };

        let group_commit = GroupCommit::new(
            options.group_commit_window,
            options.group_commit_max_batch,
            resume_at,
        );

        let handle = Self {
            file,
            next_lsn: CachePadded::new(AtomicU64::new(resume_at)),
            synced_lsn: CachePadded::new(AtomicU64::new(resume_at)),
            group_commit,
            path: path.to_path_buf(),
            #[cfg(all(target_os = "linux", feature = "async"))]
            native_ring: std::sync::OnceLock::new(),
            direct: direct_active,
            log_buffer,
            observer: None,
            sync_mode: options.sync_mode,
        };
        Ok(handle)
    }

    /// 0.9.2 — installs the structured-telemetry observer on this
    /// journal handle. Called by [`crate::Handle::journal`] /
    /// [`crate::Handle::journal_with`] right after opening, with
    /// the parent handle's observer (if any).
    ///
    /// Idempotent: calling twice replaces any previously installed
    /// observer with the new one.
    pub(crate) fn set_observer(
        &mut self,
        observer: Option<std::sync::Arc<dyn crate::observer::FsysObserver>>,
    ) {
        self.observer = observer;
    }

    /// Returns `true` when this journal is using the Direct-IO
    /// log-buffer path. Returns `false` for buffered-mode journals
    /// AND for journals that requested direct mode but had it
    /// silently downgraded to buffered (filesystem rejected
    /// `O_DIRECT`).
    #[must_use]
    #[inline]
    pub fn is_direct_active(&self) -> bool {
        self.direct
    }

    /// Appends `record` to the journal and returns the LSN
    /// immediately after it (i.e. the next-write position).
    ///
    /// Does **not** fsync. Durability is a separate, explicit step
    /// via [`Self::sync_through`] — that separation is the whole
    /// point of the journal: many cheap appends amortised across
    /// one fsync at a transaction boundary.
    ///
    /// When an observer is installed, one `JournalAppendEvent` is
    /// emitted per call (successful or not) carrying the frame
    /// size, a record count of 1, the elapsed time and an error
    /// flag.
    ///
    /// # Concurrency
    ///
    /// Multiple threads may call `append` concurrently against the
    /// same `Arc<JournalHandle>`. The LSN reservation is a single
    /// `AtomicU64::fetch_add` (no mutex on the hot path); the
    /// underlying `pwrite` calls are concurrent-safe per POSIX
    /// for typical record sizes. For sub-page records (≤ 4 KiB
    /// typically) `pwrite` is atomic per call; larger records
    /// are looped on partial writes inside the platform layer.
    ///
    /// For bulk-load patterns (N records committed together), prefer
    /// [`Self::append_batch`] — single LSN reservation, single syscall,
    /// ~1.6× per-record reduction over `append`-in-loop.
    ///
    /// # Errors
    ///
    /// - [`Error::Io`] on the underlying write failure.
    /// - [`Error::Io`] with `InvalidInput` if the record exceeds
    ///   `FRAME_MAX_PAYLOAD` (256 MiB).
    ///
    /// # Examples
    ///
    /// ```no_run
    /// use fsys::builder;
    ///
    /// # fn example() -> fsys::Result<()> {
    /// let fs = builder().build()?;
    /// let log = fs.journal("/var/lib/myapp/log.wal")?;
    ///
    /// // Append three records — no fsync, no syscall amplification.
    /// let _ = log.append(b"txn 1: insert k=v")?;
    /// let _ = log.append(b"txn 2: update k=v'")?;
    /// let lsn = log.append(b"txn 3: commit")?;
    ///
    /// // One fsync covers every prior append.
    /// log.sync_through(lsn)?;
    /// # Ok(())
    /// # }
    /// ```
    pub fn append(&self, record: &[u8]) -> Result<Lsn> {
        #[cfg(feature = "tracing")]
        let _span = tracing::trace_span!(
            "fsys::journal::append",
            payload_bytes = record.len(),
            direct = self.direct,
        )
        .entered();

        // 0.9.2 observer instrumentation, 0.9.6 audit M-1 shape:
        // the observer is looked at exactly once, and the clock is
        // only read when one is actually installed. With no
        // observer this whole preamble is a single `None` branch.
        let timed = self.observer.as_ref().map(|obs| (obs, Instant::now()));

        let outcome = self.append_inner(record);

        if let Some((obs, started)) = timed {
            // Reported size is the on-disk frame: payload plus the
            // fixed framing overhead.
            let frame_bytes = record.len() as u64 + format::FRAME_OVERHEAD as u64;
            let event = crate::observer::JournalAppendEvent {
                bytes_written: frame_bytes,
                records: 1,
                duration: started.elapsed(),
                error: outcome.is_err(),
            };
            obs.on_journal_append(event);
        }

        outcome
    }

    /// Un-instrumented append: writes one framed record and
    /// returns the LSN immediately past it.
    ///
    /// Two routes, selected by whether a log buffer exists:
    ///
    /// - **Direct-IO** (`log_buffer` is `Some`): the frame is handed
    ///   to the log buffer, which returns the frame's `(start, end)`
    ///   offsets. `end` is then published into `next_lsn`.
    /// - **Buffered** (`log_buffer` is `None`): the frame size is
    ///   validated, a slot is reserved with one atomic `fetch_add`,
    ///   the frame is encoded (on the stack when it fits in
    ///   `STACK_FRAME_THRESHOLD`, otherwise on the heap) and written
    ///   at the reserved offset.
    ///
    /// # Errors
    ///
    /// - [`Error::Io`] with `InvalidInput` when, on the buffered
    ///   route, the payload exceeds `FRAME_MAX_PAYLOAD` or the frame
    ///   size overflows `usize`.
    /// - Any error from the log buffer, the frame encoder or the
    ///   platform write is propagated unchanged.
    fn append_inner(&self, record: &[u8]) -> Result<Lsn> {
        if let Some(log_buffer) = &self.log_buffer {
            // Direct-IO log-buffer path.
            // 0.9.5: the LogBuffer is self-locking (internal mutex
            // + condvar). Multiple appenders serialise briefly on
            // the state lock for the copy into the active slot,
            // but the slow `write_at_direct` syscall happens
            // unlocked — appenders into the new active slot
            // proceed concurrently with the flush of the dormant
            // slot.
            let (_start, end) = log_buffer.append_frame(&self.file, record)?;
            // Publish the new frontier with `fetch_max`, not
            // `store`: by the time we get here the buffer's lock
            // has been released, so two appenders can reach this
            // line in the opposite order to their frames. A plain
            // store from the earlier frame would then move
            // `next_lsn` backwards and break its "never decreases"
            // contract; `fetch_max` keeps it monotonic regardless
            // of arrival order.
            let _ = self.next_lsn.fetch_max(end, Ordering::Release);
            #[cfg(feature = "tracing")]
            tracing::trace!(end_lsn = end, "direct append complete");
            return Ok(Lsn(end));
        }

        // Buffered mode: the lock-free 0.8.0 path.
        //
        // Encode the frame: 12 bytes of overhead (magic +
        // length + crc32c) wrap the user's payload. Uniform
        // framing is load-bearing — even zero-length records
        // produce a 12-byte header-only frame so the reader's
        // forward-iteration invariant holds. We bounds-check the
        // record length against `FRAME_MAX_PAYLOAD` (256 MiB)
        // and the total frame size against `usize::MAX` before
        // any allocation or LSN reservation.
        let payload_len = record.len();
        if (payload_len as u64) > (format::FRAME_MAX_PAYLOAD as u64) {
            return Err(Error::Io(std::io::Error::new(
                std::io::ErrorKind::InvalidInput,
                "journal record exceeds FRAME_MAX_PAYLOAD (256 MiB)",
            )));
        }
        let total = payload_len
            .checked_add(format::FRAME_OVERHEAD)
            .ok_or_else(|| {
                Error::Io(std::io::Error::new(
                    std::io::ErrorKind::InvalidInput,
                    "journal frame size overflow",
                ))
            })?;
        let frame_len = total as u64;

        // Reserve a slot for the entire frame. The LSN
        // semantics: caller-visible LSN is the byte offset
        // *immediately past* this frame — i.e. the start of the
        // next append. Internally, the file's byte content is
        // framed; readers using `JournalReader` walk the
        // frames forward and yield payloads.
        //
        // fetch_add is `Release` (0.9.7 M-2 — was `AcqRel`).
        //
        // The reservation step does not read any non-atomic
        // memory protected by another thread's prior Release —
        // the appender does not consult shared state set up by
        // another appender's pwrite. So the `Acquire` half of
        // the previous `AcqRel` was defensive overhead.
        //
        // `Release` IS load-bearing: the syncer's
        // `self.next_lsn.load(Ordering::Acquire)` in
        // `sync_through` synchronises-with this Release, so the
        // syncer observes the latest reserved frontier (i.e.
        // every appender's `end` value publishes through this
        // Release into the syncer's Acquire view).
        //
        // Net cost on aarch64: `fetch_add(Release)` lowers to
        // `LDADDL` (store-release only), whereas `AcqRel` emits
        // `LDADDAL` with the additional load-acquire semantics.
        // ~0.2-0.5 µs/op saved on tight appender loops.
        let start = self.next_lsn.fetch_add(frame_len, Ordering::Release);
        let end = start + frame_len;

        // 0.9.1 stack-allocated frame fast path: for typical
        // WAL records (≤ STACK_FRAME_THRESHOLD-12 bytes payload,
        // i.e. ≤ 2036 bytes — covers virtually every real-world
        // WAL record), encode directly into a stack array,
        // eliminating the per-append `Vec<u8>` allocation that
        // dominates the bulk-load tight-loop profile. Records
        // larger than the threshold fall back to the heap-
        // allocated path. Lock-free hot path: pwrite directly
        // against `&self.file`; concurrent appenders write to
        // distinct offsets per the LSN-reservation invariant.
        if total <= STACK_FRAME_THRESHOLD {
            // Use `MaybeUninit` to skip the per-call zero-init
            // of an entire `[u8; STACK_FRAME_THRESHOLD]` array.
            // Bytes `[total..STACK_FRAME_THRESHOLD]` are never
            // touched — only `stack[..total]` is handed to the
            // encoder and then to `write_at`.
            let mut stack: std::mem::MaybeUninit<[u8; STACK_FRAME_THRESHOLD]> =
                std::mem::MaybeUninit::uninit();
            // SAFETY: `stack.as_mut_ptr().cast::<u8>()` points at
            // `STACK_FRAME_THRESHOLD` bytes of valid, writable
            // stack storage that outlives `stack_slice` (`u8` has
            // alignment 1, so alignment is trivially satisfied).
            // `total <= STACK_FRAME_THRESHOLD` was checked just
            // above, so the `total`-byte slice lies entirely
            // inside that storage, and nothing else aliases it.
            // The bytes start out uninitialised; soundness relies
            // on the encoder writing every byte of the slice
            // before anything reads it.
            // NOTE(review): that write-before-read property of
            // `encode_frame_into` is assumed, not visible here —
            // confirm (e.g. under Miri).
            let stack_slice: &mut [u8] =
                unsafe { std::slice::from_raw_parts_mut(stack.as_mut_ptr().cast::<u8>(), total) };
            let _ = format::encode_frame_into(record, stack_slice)?;
            crate::platform::write_at(&self.file, start, stack_slice)?;
        } else {
            let frame = format::encode_frame_owned(record)?;
            crate::platform::write_at(&self.file, start, &frame)?;
        }

        Ok(Lsn(end))
    }

    /// Appends `records` to the journal as a single batched
    /// operation and returns the LSN immediately after the last
    /// record (i.e. the next-write position after the batch).
    ///
    /// **0.9.1 — bulk-load fast path.** This is the API every
    /// caller doing a multi-record bulk insert should use. It
    /// is materially faster than calling [`Self::append`] in a
    /// tight loop because:
    ///
    /// - **One LSN reservation** (`AtomicU64::fetch_add`) covers
    ///   the entire batch instead of N reservations contending
    ///   on the same atomic.
    /// - **One contiguous heap allocation** holds every encoded
    ///   frame, instead of N small allocations.
    /// - **One platform `pwrite` syscall** (or one log-buffer
    ///   mutex acquisition in Direct mode) submits all frames
    ///   at once, instead of N independent submissions.
    /// - **One CRC dispatch entry per record** still — frames
    ///   remain individually CRC-protected so partial-batch
    ///   crash recovery yields the longest valid prefix, the
    ///   same crash-safety contract as per-record `append`.
    ///
    /// Records inside one `append_batch` call are individually
    /// frame-protected. They are **not** transactionally atomic
    /// as a group — a crash mid-write may leave a CRC-validated
    /// prefix on disk. Callers that need all-or-nothing batch
    /// semantics must layer that on top (e.g., a transaction
    /// marker record at the end of the batch).
    ///
    /// An empty `records` slice is a no-op and returns the
    /// current next-write position.
    ///
    /// Like [`Self::append`], this method does **not** fsync.
    /// Pair with [`Self::sync_through`] for durability.
    ///
    /// # Errors
    ///
    /// - [`Error::Io`] if any record exceeds the journal's
    ///   maximum payload size (256 MiB), if the total batch
    ///   size overflows `usize`, or if the underlying write
    ///   fails.
    ///
    /// # Examples
    ///
    /// ```no_run
    /// use fsys::builder;
    ///
    /// # fn example() -> fsys::Result<()> {
    /// let fs = builder().build()?;
    /// let log = fs.journal("/var/lib/myapp/log.wal")?;
    ///
    /// // 256 records committed as one syscall.
    /// let records: Vec<Vec<u8>> = (0..256)
    ///     .map(|i| format!("txn {i}: insert").into_bytes())
    ///     .collect();
    /// let refs: Vec<&[u8]> = records.iter().map(Vec::as_slice).collect();
    /// let lsn = log.append_batch(&refs)?;
    ///
    /// // One fsync, durable batch.
    /// log.sync_through(lsn)?;
    /// # Ok(())
    /// # }
    /// ```
    pub fn append_batch(&self, records: &[&[u8]]) -> Result<Lsn> {
        #[cfg(feature = "tracing")]
        let _span = tracing::trace_span!(
            "fsys::journal::append_batch",
            record_count = records.len(),
            direct = self.direct,
        )
        .entered();

        // Pair the observer with its start timestamp in a single
        // `Option`: the clock is only read when an observer is
        // installed, and the post-append hook needs one check.
        let timed_observer = self.observer.as_ref().map(|obs| (obs, Instant::now()));

        let outcome = self.append_batch_inner(records);

        if let Some((obs, started)) = timed_observer {
            // The reported size is derived from the input slice
            // (payload bytes plus per-record framing), whether or
            // not the append itself succeeded; `error` tells the
            // observer which case it is looking at.
            let payload_bytes: u64 = records.iter().map(|record| record.len() as u64).sum();
            let framing_bytes = records.len() as u64 * format::FRAME_OVERHEAD as u64;
            obs.on_journal_append(crate::observer::JournalAppendEvent {
                bytes_written: payload_bytes + framing_bytes,
                // Saturate rather than truncate for absurdly large batches.
                records: u32::try_from(records.len()).unwrap_or(u32::MAX),
                duration: started.elapsed(),
                error: outcome.is_err(),
            });
        }
        outcome
    }

    /// Un-instrumented body of [`Self::append_batch`].
    ///
    /// Validates every record, then either hands the batch to the
    /// Direct-IO log buffer (when `self.log_buffer` is present) or
    /// reserves one contiguous LSN range, encodes all frames into a
    /// single heap buffer and submits it with one positioned write.
    ///
    /// Returns the LSN immediately past the last frame of the batch.
    /// An empty `records` slice returns the current `next_lsn`
    /// without touching the journal.
    ///
    /// # Errors
    ///
    /// - [`Error::Io`] (`InvalidInput`) if a record is larger than
    ///   `format::FRAME_MAX_PAYLOAD` or the encoded sizes overflow
    ///   `usize`; this is detected before any LSN is reserved.
    /// - Any error surfaced by the log buffer, the frame encoder or
    ///   the positioned write.
    fn append_batch_inner(&self, records: &[&[u8]]) -> Result<Lsn> {
        if records.is_empty() {
            return Ok(Lsn(self.next_lsn.load(Ordering::Acquire)));
        }

        // Validate every record up-front and compute the total
        // batch encoded size. Doing this before reserving the
        // LSN slot means a malformed batch (oversize record or
        // arithmetic overflow) returns a clean error without
        // disturbing the LSN sequence.
        let mut total: usize = 0;
        for record in records {
            if (record.len() as u64) > (format::FRAME_MAX_PAYLOAD as u64) {
                return Err(Error::Io(std::io::Error::new(
                    std::io::ErrorKind::InvalidInput,
                    "journal record exceeds FRAME_MAX_PAYLOAD (256 MiB)",
                )));
            }
            let frame_size = record
                .len()
                .checked_add(format::FRAME_OVERHEAD)
                .ok_or_else(|| {
                    Error::Io(std::io::Error::new(
                        std::io::ErrorKind::InvalidInput,
                        "journal frame size overflow",
                    ))
                })?;
            total = total.checked_add(frame_size).ok_or_else(|| {
                Error::Io(std::io::Error::new(
                    std::io::ErrorKind::InvalidInput,
                    "journal append_batch total size overflow",
                ))
            })?;
        }

        if let Some(log_buffer) = &self.log_buffer {
            // Direct-IO log-buffer path.
            //
            // 0.9.6 — try the batched fast path first
            // (`try_append_frames_batched`): when the entire batch
            // fits in the active slot's remaining capacity, every
            // record is encoded + memcopied under ONE state-lock
            // acquisition. For an N-record batch on contended
            // threads this saves N-1 lock acquire/release cycles
            // (~50-100 ns each uncontended, µs each contended).
            //
            // The batched fast path returns `None` when the batch
            // doesn't fit in one shot (would require rotation or
            // includes an oversize record). In that case we fall
            // back to the per-record loop which handles rotation,
            // mid-flush waits, and the oversize-standalone path
            // correctly. 0.9.5's concurrent-flush invariants are
            // preserved on both paths.
            let last_end = match log_buffer.try_append_frames_batched(records, total)? {
                Some((_start, end)) => end,
                None => {
                    // `records` is non-empty here, so the initial
                    // value is always overwritten by the loop.
                    let mut last: u64 = self.next_lsn.load(Ordering::Acquire);
                    for record in records {
                        let (_start, end) = log_buffer.append_frame(&self.file, record)?;
                        last = end;
                    }
                    last
                }
            };
            // Publish with `fetch_max`, not `store`. The log buffer
            // hands out frame positions, but this publication runs
            // after the buffer call has returned, so two concurrent
            // appenders can reach this line in the opposite order
            // to the one in which their frames were placed. A plain
            // `store` would then let the appender holding the
            // smaller `end` overwrite the larger one, moving
            // `next_lsn` backwards and making `sync_through` capture
            // a frontier below data that has already been appended.
            // `fetch_max` keeps the cursor monotonic.
            let _ = self.next_lsn.fetch_max(last_end, Ordering::Release);
            #[cfg(feature = "tracing")]
            tracing::trace!(end_lsn = last_end, "direct append_batch complete");
            return Ok(Lsn::new(last_end));
        }

        // Buffered-mode batch path: one LSN reservation, one
        // contiguous heap allocation, one platform `pwrite`.
        // This is the path that recovers the bulk-load lead vs
        // v0.8.5: the per-record framing overhead now amortises
        // across N records instead of paying N independent
        // syscalls + N independent LSN-reservation atomics.
        let frame_total = total as u64;
        // `Release` (0.9.7 M-2 — was `AcqRel`). Same reasoning
        // as the single-record path: the reservation does not
        // consult shared state set up by a peer appender, so the
        // `Acquire` half is defensive overhead. The syncer's
        // `Acquire`-load on `next_lsn` synchronises-with this
        // `Release`.
        let start = self.next_lsn.fetch_add(frame_total, Ordering::Release);
        let end = start + frame_total;

        // Allocate without zeroing — `encode_frame_into` writes
        // every byte of every frame, and `write_at` only reads
        // the first `total` bytes. Skipping the `vec![0u8; total]`
        // memset eliminates a `total`-byte zero pass on the hot
        // path; on a 5 K × 150 B WAL batch (~810 KiB) that's the
        // difference between a 0.77× regression and a 1.6× win
        // vs `append`-in-loop on the canonical sanity bench.
        //
        // `clippy::uninit_vec` warns categorically against this
        // pattern; we override per-call because the surrounding
        // encode loop establishes the must-write-before-read
        // invariant for every byte in `[0..total]`.
        #[allow(clippy::uninit_vec)]
        let mut buf: Vec<u8> = {
            let mut v: Vec<u8> = Vec::with_capacity(total);
            // SAFETY: `Vec::with_capacity(total)` reserves at
            // least `total` bytes of valid, allocator-aligned
            // heap memory; `set_len(total)` exposes those bytes
            // as `u8` (which has no invalid bit patterns and is
            // not `Drop`). Every byte in `v[0..total]` is fully
            // written by the encoder loop below — across all
            // frames the cursor walks from 0 to `total` exactly
            // once — before `write_at` ever reads from `&buf`.
            // No uninitialised byte is ever observed.
            unsafe {
                v.set_len(total);
            }
            v
        };
        let mut cursor = 0usize;
        for record in records {
            let written = format::encode_frame_into(record, &mut buf[cursor..])?;
            cursor += written;
        }
        // The up-front size computation and the encoder must agree;
        // a mismatch would mean part of `buf` was never written.
        debug_assert_eq!(cursor, total);

        crate::platform::write_at(&self.file, start, &buf)?;

        #[cfg(feature = "tracing")]
        tracing::trace!(end_lsn = end, "buffered append_batch complete");
        Ok(Lsn(end))
    }

    /// Forces all bytes up to `lsn` to stable storage.
    ///
    /// # Group-commit semantics
    ///
    /// Concurrent `sync_through` calls coalesce into a single `fsync`
    /// syscall via a leader/follower coordinator. The first caller
    /// in each round becomes the **leader** and runs the fsync;
    /// subsequent callers waiting for an LSN ≤ the synced frontier
    /// become **followers** and return as soon as the leader's
    /// syscall completes. This is the architectural lever behind the
    /// journal's 100-700× throughput advantage over per-write
    /// atomic-replace (one syscall amortised across N callers).
    ///
    /// The 0.9.7 H-16 wake-path fix ensures followers exit via
    /// atomic-only checks on `synced_lsn` rather than re-acquiring
    /// the state mutex — a ~5× reduction in lock-hold time under
    /// 100+ concurrent followers.
    ///
    /// # Edge cases
    ///
    /// - [`Lsn::ZERO`] is the start-of-journal sentinel;
    ///   `sync_through(Lsn::ZERO)` is a guaranteed no-op.
    /// - `sync_through(lsn)` where `lsn` exceeds the highest appended
    ///   LSN syncs the currently-appended frontier (whatever
    ///   `next_lsn` is at the moment the leader captures it). The
    ///   "future" portion of `lsn` is not covered — that's the
    ///   responsibility of a subsequent `sync_through` call.
    ///
    /// # Errors
    ///
    /// - [`Error::Io`] if the underlying `fsync`/`fdatasync`/
    ///   platform-equivalent syscall fails.
    /// - [`Error::Io`] with the inner error reflecting any
    ///   buffer-flush failure on Direct-IO journals.
    ///
    /// On either failure the durable frontier is left unchanged,
    /// the leader slot is released and parked followers are woken,
    /// so a later `sync_through` call can retry.
    ///
    /// # Examples
    ///
    /// ```no_run
    /// use fsys::builder;
    ///
    /// # fn example() -> fsys::Result<()> {
    /// let fs = builder().build()?;
    /// let log = fs.journal("/var/lib/myapp/log.wal")?;
    ///
    /// // Many cheap appends, no per-call fsync.
    /// for i in 0..1000 {
    ///     log.append(format!("record {i}").as_bytes())?;
    /// }
    ///
    /// // One group-commit fsync covers every prior append.
    /// log.sync_through(log.next_lsn())?;
    ///
    /// // Verify the durable frontier moved.
    /// assert!(log.synced_lsn() >= log.next_lsn());
    /// # Ok(())
    /// # }
    /// ```
    pub fn sync_through(&self, lsn: Lsn) -> Result<()> {
        #[cfg(feature = "tracing")]
        let _span = tracing::trace_span!(
            "fsys::journal::sync_through",
            target_lsn = lsn.0,
            direct = self.direct,
        )
        .entered();

        // Fast path: the durable frontier already covers our
        // target. The atomic load is unconditionally cheaper
        // than acquiring the group-commit state mutex.
        if self.synced_lsn.load(Ordering::Acquire) >= lsn.0 {
            #[cfg(feature = "tracing")]
            tracing::trace!(
                synced_lsn = self.synced_lsn.load(Ordering::Acquire),
                "fast-path sync skipped"
            );
            return Ok(());
        }

        // 0.9.1 leader/follower group-commit. Loop is required
        // because a follower waking to a `committed_lsn` still
        // below its target must be promoted to leader of the
        // next cycle (this happens when an appender lands a
        // record after the previous leader captured its
        // frontier). The loop body is purely state-machine
        // bookkeeping; the actual fsync runs outside the lock.
        // 0.9.2: `leader_start` captures wall-time at the
        // moment we become the leader; the observer (if any)
        // emits a single event after the fsync completes,
        // covering both the optional window-wait and the
        // syscall.
        let leader_start: Instant;
        {
            let mut state = self.group_commit.state.lock();
            loop {
                if state.committed_lsn >= lsn.0 {
                    #[cfg(feature = "tracing")]
                    tracing::trace!(
                        committed_lsn = state.committed_lsn,
                        "group-commit follower covered without leadership"
                    );
                    return Ok(());
                }
                if !state.in_flight {
                    state.in_flight = true;
                    leader_start = Instant::now();
                    break;
                }
                // Become a follower. 0.9.7 H-16 —
                // `pending_followers` is an `AtomicU32` on
                // `GroupCommit`; atomic-increment publishes the
                // join to the leader's window-check.
                let _ = self
                    .group_commit
                    .pending_followers
                    .fetch_add(1, Ordering::Release);
                // Wake leader so it can re-check max_batch
                // against the new pending_followers count.
                // `notify_one` returns whether a thread was
                // actually woken — we don't care here, the
                // leader will re-check on its own deadline if
                // it isn't currently parked.
                let _ = self.group_commit.cv_leader.notify_one();
                self.group_commit.cv_followers.wait(&mut state);
                // 0.9.7 H-16 — release the state lock
                // **immediately** after the condvar wake. Under
                // 100+ followers, the previous code held the
                // lock long enough to decrement `pending_followers`
                // and re-check `state.committed_lsn`, forcing
                // every woken follower to serialise through the
                // lock. The fast path now uses atomics only.
                drop(state);
                let _ = self
                    .group_commit
                    .pending_followers
                    .fetch_sub(1, Ordering::AcqRel);
                // Atomic-load `synced_lsn` (the public atomic
                // mirror of `state.committed_lsn`, stored with
                // `Release` by the leader in the publish step at
                // the end of this function). If our target is
                // covered, return without ever re-acquiring the
                // state lock — this is the wake-stampede fix.
                if self.synced_lsn.load(Ordering::Acquire) >= lsn.0 {
                    return Ok(());
                }
                // Slow path: target not yet covered (a later
                // append landed after the leader captured its
                // frontier, or the leader's sync failed).
                // Re-acquire the lock and loop to possibly
                // become the next cycle's leader.
                state = self.group_commit.state.lock();
            }

            // Leader path: optionally wait `window` for
            // additional followers to enqueue, exiting early
            // once `max_batch` are present. The guard is handed
            // to `wait_for` for the duration of each park, and
            // followers take the lock only briefly before they
            // park on `cv_followers`, so contention is minimal.
            // 0.9.7 H-16 — `pending_followers` is atomic; the
            // leader's reads are advisory loads, not under-lock
            // reads (stale values just affect window-wait
            // timing, never correctness).
            if let Some(window) = self.group_commit.window {
                let deadline = Instant::now() + window;
                while self.group_commit.pending_followers.load(Ordering::Acquire)
                    < self.group_commit.max_batch
                {
                    let now = Instant::now();
                    if now >= deadline {
                        break;
                    }
                    let timeout = deadline - now;
                    let result = self.group_commit.cv_leader.wait_for(&mut state, timeout);
                    if result.timed_out() {
                        break;
                    }
                }
            }
            drop(state);
        }

        // From here on this thread owns `in_flight`. Every path
        // below MUST reach the publish step that clears it;
        // returning early would leave `in_flight == true` forever
        // and park every later caller on `cv_followers`.

        // Capture the append frontier *before* flushing the log
        // buffer. We commit only up through this point; subsequent
        // appenders may extend `next_lsn` further, but those
        // records are the next leader's responsibility. In Direct
        // mode `append_batch` publishes `next_lsn` only after its
        // frames are in the log buffer, so everything at or below
        // the value read here is visible to the flush that follows.
        // Loading it after the flush would let a record buffered
        // in between be published as durable while it still lives
        // only in memory. The captured value is therefore a
        // conservative lower bound on what the flush + fsync make
        // durable.
        let frontier = self.next_lsn.load(Ordering::Acquire);

        // Direct-IO mode: flush any partially buffered records
        // through a sector-aligned positioned write *before* the
        // fsync. The result is carried as a value rather than
        // propagated with `?` so a flush failure still falls
        // through to the publish step.
        let flush_result = self.flush_log_buffer_for_sync();

        // The actual fsync — outside both the group-commit
        // state lock and the log-buffer lock. Skipped when the
        // flush failed: the buffered bytes never reached the
        // file, so syncing could not cover the target anyway.
        // Concurrent appenders may make progress during the
        // call; their writes may or may not be covered,
        // depending on kernel scheduling.
        //
        // 0.9.4: route through `sync_mode`. `Full` (default)
        // keeps the pre-0.9.4 behaviour bit-for-bit
        // (`file.sync_data()`); `Barrier` calls
        // `platform::sync_barrier` which is cheaper on macOS
        // with PLP, identical on Linux (fdatasync is already
        // barrier-grade), no-op on Windows. See `SyncMode`
        // docs for the safety contract.
        let sync_result = flush_result.and_then(|()| match self.sync_mode {
            options::SyncMode::Full => self.file.sync_data().map_err(Error::Io),
            options::SyncMode::Barrier => crate::platform::sync_barrier(&self.file),
        });

        // Re-acquire state to publish the result. If the flush
        // or fsync failed, we still clear `in_flight` and notify
        // followers — they will inherit the error via their
        // own retry on the next sync_through call. We do NOT
        // advance `committed_lsn` on failure, so followers
        // re-evaluate and may become the next-cycle leader
        // (where they re-attempt the sync themselves).
        let followers_at_commit;
        {
            let mut state = self.group_commit.state.lock();
            if sync_result.is_ok() && frontier > state.committed_lsn {
                state.committed_lsn = frontier;
                self.synced_lsn.store(frontier, Ordering::Release);
            }
            // 0.9.7 H-16 — atomic-load advisory snapshot of
            // currently parked followers (for the observer
            // hook). Reading inside the lock window gives a
            // stable value for the duration of `notify_all`.
            followers_at_commit = self.group_commit.pending_followers.load(Ordering::Acquire);
            state.in_flight = false;
            // `notify_all` returns the count of woken threads;
            // we don't care for backpressure purposes — every
            // parked follower needs to re-evaluate its target.
            let _ = self.group_commit.cv_followers.notify_all();
        }

        #[cfg(feature = "tracing")]
        if sync_result.is_ok() {
            tracing::debug!(new_synced_lsn = frontier, "group-commit fsync completed");
        }

        // 0.9.2 observer hook — leader-only. Followers returned
        // early inside the election loop above without ever
        // reaching this point. Failed cycles are reported too,
        // flagged through `error`.
        if let Some(obs) = self.observer.as_ref() {
            obs.on_journal_sync(crate::observer::JournalSyncEvent {
                durable_lsn: frontier,
                duration: leader_start.elapsed(),
                followers_at_commit,
                error: sync_result.is_err(),
            });
        }

        sync_result
    }

    /// Flushes the Direct-IO log buffer, if this journal has one,
    /// on behalf of the `sync_through` leader. Buffered journals
    /// have nothing to flush and return `Ok(())`.
    ///
    /// Kept separate so the leader receives the outcome as a value
    /// and can always run its publish step.
    fn flush_log_buffer_for_sync(&self) -> Result<()> {
        if let Some(log_buffer) = &self.log_buffer {
            log_buffer.flush_partial(&self.file)?;
        }
        Ok(())
    }

    /// Reports the durable frontier: the highest LSN this handle
    /// has recorded as being on stable storage.
    ///
    /// The value is advanced by successful [`Self::sync_through`]
    /// cycles. Handy for observability — e.g. exporting "durable
    /// bytes written" as a metric.
    #[must_use]
    #[inline]
    pub fn synced_lsn(&self) -> Lsn {
        let durable = self.synced_lsn.load(Ordering::Acquire);
        Lsn(durable)
    }

    /// Reports the current end-of-journal cursor — the LSN the
    /// next [`Self::append`] would start from.
    ///
    /// Useful for snapshotting / replication: the value read at a
    /// given moment marks "everything appended up to here."
    #[must_use]
    #[inline]
    pub fn next_lsn(&self) -> Lsn {
        let cursor = self.next_lsn.load(Ordering::Acquire);
        Lsn(cursor)
    }

    /// Classifies which IO backend is serving this journal at the
    /// moment of the call (1.1.0).
    ///
    /// Resolution order:
    ///
    /// 1. [`JournalBackendKind::KernelIoUring`] — only on Linux
    ///    builds with the `async` feature, and only once the
    ///    native ring for this journal holds a constructed
    ///    substrate (typically after the first `append_async` /
    ///    `sync_through_async` call).
    /// 2. [`JournalBackendKind::KernelDirect`] — the journal was
    ///    opened with
    ///    [`JournalOptions::direct(true)`](JournalOptions::direct).
    /// 3. [`JournalBackendKind::KernelBuffered`] — everything
    ///    else: the default buffered path (lock-free `pwrite` +
    ///    group-commit `fdatasync`).
    ///
    /// [`JournalBackendKind::Spdk`] is never returned here; SPDK
    /// journals are not selectable from
    /// [`Handle::journal`](crate::Handle::journal) in 1.1.0 and
    /// are constructed through the companion `fsys-spdk` crate
    /// when it ships.
    ///
    /// Ops teams should use this accessor to confirm which backend
    /// is live — a silent fallback (SPDK requested, kernel path
    /// actually serving) would otherwise invalidate downstream
    /// performance expectations.
    #[must_use]
    #[inline]
    pub fn backend_kind(&self) -> JournalBackendKind {
        #[cfg(all(target_os = "linux", feature = "async"))]
        {
            // An initialised cell holding `Some(_)` means the ring
            // was built successfully; anything else falls through
            // to the kernel-path classification below.
            if matches!(self.native_ring.get(), Some(Some(_))) {
                return JournalBackendKind::KernelIoUring;
            }
        }
        if !self.direct {
            return JournalBackendKind::KernelBuffered;
        }
        JournalBackendKind::KernelDirect
    }

    /// Produces a health-counter snapshot for this journal
    /// (1.1.0).
    ///
    /// 1.1.0 ships the public counter shape
    /// ([`JournalBackendHealth`]) and this accessor. The snapshot
    /// is currently built with [`JournalBackendHealth::empty`] for
    /// the live backend kind — no per-journal counters are read
    /// yet. The counter wiring lands in a follow-up release
    /// alongside the internal trait extraction; until then,
    /// unwired fields report `0`.
    ///
    /// Polling cost is sub-microsecond — safe to call from a
    /// per-second health-check loop.
    #[must_use]
    #[inline]
    pub fn backend_health(&self) -> JournalBackendHealth {
        let kind = self.backend_kind();
        JournalBackendHealth::empty(kind)
    }

    /// Describes why this journal's backend was chosen, as a
    /// selection trail (1.1.0).
    ///
    /// The kernel path is the only backend available in 1.1.0, so
    /// the trail always has exactly one entry: the kind reported
    /// by [`Self::backend_kind`] plus a fixed human-readable
    /// reason. When SPDK lands in `fsys-spdk`, the trail will
    /// include both the SPDK path (skipped, with reason) and the
    /// kernel path (selected).
    ///
    /// The [`SystemTime`](std::time::SystemTime) carried by the
    /// result is approximate — it reflects this call, not the
    /// moment the journal was opened. A future refactor will plumb
    /// the real open time through; see "what we explicitly don't
    /// promise" in `docs/STABILITY-1.0.md`. Operators should rely
    /// on the system journal, not this field, for incident
    /// timelines.
    #[must_use]
    pub fn backend_info(&self) -> JournalBackendInfo {
        let kind = self.backend_kind();
        // One fixed explanation per backend kind. The match is
        // exhaustive on purpose so a new variant forces an update.
        let reason = match kind {
            JournalBackendKind::KernelBuffered => {
                "default buffered-IO mode (lock-free pwrite + group-commit fdatasync)"
            }
            JournalBackendKind::KernelDirect => {
                "Direct-IO mode (JournalOptions::direct(true)) active"
            }
            JournalBackendKind::KernelIoUring => {
                "Linux io_uring native substrate active for this journal"
            }
            JournalBackendKind::Spdk => {
                "SPDK backend in use (opened via fsys-spdk companion crate)"
            }
        };
        JournalBackendInfo::single(kind, reason)
    }

    /// Reserves `len` bytes of disk space for this journal,
    /// beginning at byte `offset`.
    ///
    /// Reserving extents ahead of time keeps filesystem allocation
    /// out of the append hot path, which matters for
    /// high-throughput WAL workloads where allocation jitter shows
    /// up as long-tail latency.
    ///
    /// Typical usage: call once right after
    /// [`crate::Handle::journal`] returns, passing the expected
    /// total journal size (or a generous upper bound).
    ///
    /// `offset = 0` means "start at the beginning"; `len` is the
    /// number of bytes to reserve.
    ///
    /// # Platform behaviour
    ///
    /// The work is delegated to the crate's platform layer:
    ///
    /// - **Linux:** `fallocate(FALLOC_FL_KEEP_SIZE)` — reserves
    ///   extents without writing zeros. Falls back to
    ///   `posix_fallocate` (writes zeros) on filesystems that
    ///   don't support `fallocate`.
    /// - **macOS:** `fcntl(F_PREALLOCATE)` with contiguous
    ///   allocation; falls back to non-contiguous.
    /// - **Windows:** `SetEndOfFile` — bounds the logical size
    ///   so NTFS plans extents. True physical preallocation
    ///   (zeroing every block) requires admin privileges.
    /// - **Other platforms:** no-op (succeeds; allocation
    ///   happens on write).
    ///
    /// # Errors
    ///
    /// - [`Error::Io`] on the underlying syscall failure.
    pub fn preallocate(&self, offset: u64, len: u64) -> Result<()> {
        let file = &self.file;
        crate::platform::preallocate(file, offset, len)
    }

    /// Passes an access-pattern hint for a region of this journal
    /// to the kernel, which may use it to steer page-cache
    /// prefetch, eviction and read-ahead.
    ///
    /// Hints are advisory: they can change performance, never
    /// correctness. See [`crate::Advice`] for the available
    /// variants.
    ///
    /// `len = 0` means "the rest of the file from `offset`."
    ///
    /// # Errors
    ///
    /// - [`Error::Io`] on the underlying syscall failure
    ///   (rare; most platforms return success even when the
    ///   hint isn't actually honoured).
    pub fn advise(&self, offset: u64, len: u64, advice: crate::Advice) -> Result<()> {
        let file = &self.file;
        crate::platform::advise(file, offset, len, advice)
    }

    /// Performs a final sync and consumes the handle.
    ///
    /// Equivalent to `self.sync_through(self.next_lsn())` plus
    /// closing the file. Use when you want explicit
    /// success/failure reporting on the close path; otherwise,
    /// just drop the handle (the implicit close is best-effort).
    ///
    /// # Errors
    ///
    /// - [`Error::Io`] on fsync or close failure.
    pub fn close(self) -> Result<()> {
        let frontier = self.next_lsn.load(Ordering::Acquire);
        self.sync_through(Lsn(frontier))?;
        // File closes when `self` drops; explicit drop here for
        // documentation.
        drop(self);
        Ok(())
    }
}

// `JournalHandle` is documented as `Send + Sync`. This function is
// never called; it exists so the bound is checked at compile time,
// and a future field that is `!Send` or `!Sync` fails the build
// instead of silently changing the public contract.
#[allow(dead_code)]
fn _assert_journal_handle_is_send() {
    fn require_send_and_sync<T: Send + Sync>() {}
    require_send_and_sync::<JournalHandle>();
}

impl Drop for JournalHandle {
    /// Safety net for handles dropped without
    /// [`JournalHandle::close`]. Only Direct-mode journals (those
    /// with a log buffer) do any work here; every error is
    /// deliberately ignored because `drop` cannot report it.
    fn drop(&mut self) {
        // Buffered journals hold no user-space data to flush.
        let Some(log_buffer) = self.log_buffer.as_ref() else {
            return;
        };
        // Best-effort: push whatever is still sitting in the log
        // buffer (the partial trailing sector) to the file, then
        // ask the OS to persist it, before the writer's view of
        // those bytes is lost. 0.9.5: the LogBuffer locks itself;
        // no outer mutex is needed.
        let _ = log_buffer.flush_partial(&self.file);
        let _ = self.file.sync_data();
    }
}

// ─────────────────────────────────────────────────────────────────
// 0.9.1 — Group-commit coordinator (leader / follower)
// ─────────────────────────────────────────────────────────────────

/// Mutable state inside the group-commit coordinator. Held under
/// a [`parking_lot::Mutex`]; never held across the actual fsync
/// syscall.
///
/// Both fields are read and written by
/// [`JournalHandle::sync_through`] while it holds the coordinator's
/// state lock.
pub(crate) struct GroupCommitState {
    // 0.9.6 audit fix: all fields `pub(crate)` — accessed by
    // `src/async_io/journal.rs::sync_through_native` which is a
    // separate module from where `GroupCommitState` is defined.
    /// `true` while a leader has taken the gate and is in the
    /// process of running an fsync on behalf of itself plus any
    /// followers waiting on `cv_followers`.
    ///
    /// Set by the caller that wins the election and cleared by
    /// that same caller when it publishes the sync result, whether
    /// or not the sync succeeded.
    pub(crate) in_flight: bool,
    /// Highest LSN known durable on disk after the most recent
    /// completed fsync. Followers that arrive with a target LSN
    /// `≤ committed_lsn` return immediately without waiting.
    ///
    /// Only ever raised, and only after a successful sync; the
    /// leader mirrors each new value into the handle's atomic
    /// `synced_lsn` so it can be read without this lock.
    pub(crate) committed_lsn: u64,
    // `pending_followers` moved out of GroupCommitState in 0.9.7
    // H-16 — it is now an `AtomicU32` on `GroupCommit` so a
    // woken follower can decrement + early-exit without
    // re-acquiring this state mutex (the source of the audit's
    // "thundering herd" 100+ follower stampede).
}

/// 0.9.1 leader/follower group-commit coordinator. See
/// [`JournalHandle::group_commit`] field-doc for the full design
/// note; in summary:
///
/// - Leader/follower election via a single `Mutex<GroupCommitState>`.
///   The first call to [`JournalHandle::sync_through`] that finds
///   `in_flight = false` becomes the leader.
/// - Leaders never hold the state mutex during the actual fsync —
///   they set `in_flight = true`, drop the mutex, run the syscall,
///   then re-acquire to update `committed_lsn` and notify
///   followers.
/// - Followers wait on `cv_followers`. Followers that wake to a
///   `committed_lsn` still below their target loop and may
///   themselves be promoted to leader of the next cycle.
/// - Two condvars: `cv_followers` for sync-completion broadcasts,
///   `cv_leader` for leader-side wakeups when the next follower
///   joins (so the leader can re-check the `max_batch` early-exit
///   condition during its `window` wait).
pub(crate) struct GroupCommit {
    // 0.9.6 audit fix: state + cv_followers are `pub(crate)` (not
    // private) because `src/async_io/journal.rs::sync_through_native`
    // is a separate module that needs to participate in the
    // leader/follower protocol. The pre-0.9.6 fully-private fields
    // built only when `async` feature was off — the new
    // feature-matrix CI surfaced the breakage.
    /// Election flag and committed frontier, guarded by one mutex.
    pub(crate) state: PlMutex<GroupCommitState>,
    /// Followers park here; the leader broadcasts with
    /// `notify_all` once it has published the sync outcome.
    pub(crate) cv_followers: Condvar,
    /// The leader parks here during its optional `window` wait;
    /// each joining follower calls `notify_one` so the leader can
    /// re-check `pending_followers` against `max_batch`.
    cv_leader: Condvar,
    /// Optional batching window. When `Some`, a freshly elected
    /// leader waits up to this long for followers to join before
    /// it starts the sync; when `None`, it proceeds immediately.
    window: Option<Duration>,
    /// Early-exit threshold for the `window` wait: the leader
    /// stops waiting as soon as `pending_followers` reaches this
    /// count.
    max_batch: u32,
    /// 0.9.7 H-16 — atomic counter of followers currently parked
    /// on `cv_followers`. Moved out of `GroupCommitState` so a
    /// woken follower can decrement it + early-exit without
    /// re-acquiring the state mutex (the source of the audit's
    /// "thundering herd" 100+ follower stampede after
    /// `notify_all`).
    ///
    /// Read by the leader as the early-exit hint against
    /// `max_batch` during the optional `window` follower-batch
    /// wait. The leader's reads are advisory — a stale value
    /// just means the leader may wait slightly longer or shorter
    /// than ideal, never a correctness violation.
    pub(crate) pending_followers: std::sync::atomic::AtomicU32,
}

impl GroupCommit {
    /// Builds an idle coordinator: no leader in flight, no parked
    /// followers, and the committed frontier seeded with
    /// `initial_committed_lsn`.
    ///
    /// `window` and `max_batch` are stored as given; they control
    /// the leader's optional wait for followers before a sync.
    pub(crate) fn new(
        window: Option<Duration>,
        max_batch: u32,
        initial_committed_lsn: u64,
    ) -> Self {
        let idle_state = GroupCommitState {
            committed_lsn: initial_committed_lsn,
            in_flight: false,
        };
        let no_followers = std::sync::atomic::AtomicU32::new(0);
        Self {
            state: PlMutex::new(idle_state),
            cv_followers: Condvar::new(),
            cv_leader: Condvar::new(),
            window,
            max_batch,
            pending_followers: no_followers,
        }
    }
}

// ─────────────────────────────────────────────────────────────────
// Internal helpers — direct-mode constructor + resume-scan
// ─────────────────────────────────────────────────────────────────

/// Scans `path` for the byte offset immediately past the last
/// cleanly-decoded frame. Used by direct-mode resume to set
/// `next_lsn` past partial / corrupted trailing bytes rather than
/// at raw `file_size`.
///
/// Returns `0` for an empty / non-existent file. Surfaces an error
/// for non-recoverable tail states (`BadMagic`, `LengthOverflow`)
/// so the caller can choose to refuse the open rather than
/// silently truncate past suspect data.
fn scan_clean_end(path: &Path) -> Result<u64> {
    let mut reader = JournalReader::open(path)?;
    if reader.file_size() == 0 {
        return Ok(0);
    }
    let mut iter = reader.iter();
    while iter.next().transpose()?.is_some() {}
    drop(iter);
    match reader.tail_state() {
        JournalTailState::CleanEnd
        | JournalTailState::TruncatedHeader
        | JournalTailState::TruncatedPayload
        | JournalTailState::ChecksumMismatch => Ok(reader.position().0),
        JournalTailState::BadMagic => Err(Error::Io(std::io::Error::new(
            std::io::ErrorKind::InvalidData,
            format!("journal at {:?} has bad magic at offset {} — refusing to open in direct mode", path, reader.position().0),
        ))),
        JournalTailState::LengthOverflow => Err(Error::Io(std::io::Error::new(
            std::io::ErrorKind::InvalidData,
            format!("journal at {:?} has frame length overflow at offset {} — refusing to open in direct mode", path, reader.position().0),
        ))),
    }
}

/// Open the journal file with the platform's Direct-IO flag.
/// Returns `(file, direct_active)`. `direct_active = false` means
/// the filesystem rejected `O_DIRECT` and the caller should fall
/// back to buffered semantics.
#[cfg(target_os = "linux")]
fn open_direct_journal(path: &Path, _sector_size: u32) -> Result<(File, bool)> {
    use std::os::fd::FromRawFd;
    let path_cstr =
        std::ffi::CString::new(path.as_os_str().to_string_lossy().as_bytes()).map_err(|_| {
            Error::Io(std::io::Error::new(
                std::io::ErrorKind::InvalidInput,
                "journal path contains a NUL byte",
            ))
        })?;
    let mut flags = libc::O_RDWR | libc::O_CREAT | libc::O_CLOEXEC | libc::O_DIRECT;
    // SAFETY: path_cstr is a valid NUL-terminated string; flags +
    // mode are valid open(2) arguments.
    let fd = unsafe { libc::open(path_cstr.as_ptr(), flags, 0o600_i32) };
    if fd >= 0 {
        // SAFETY: fd is a valid open file descriptor we just
        // created/opened.
        return Ok((unsafe { File::from_raw_fd(fd) }, true));
    }
    let err = std::io::Error::last_os_error();
    if err.raw_os_error() == Some(libc::EINVAL) {
        // O_DIRECT rejected (tmpfs, FUSE, certain CIFS mounts).
        // Retry without it; the caller falls back to buffered.
        flags &= !libc::O_DIRECT;
        // SAFETY: same as above.
        let fd2 = unsafe { libc::open(path_cstr.as_ptr(), flags, 0o600_i32) };
        if fd2 >= 0 {
            // SAFETY: fd2 is a valid open file descriptor.
            return Ok((unsafe { File::from_raw_fd(fd2) }, false));
        }
        return Err(Error::Io(std::io::Error::last_os_error()));
    }
    Err(Error::Io(err))
}

/// macOS variant: opens (creating if needed, never truncating) the
/// journal read/write, then asks for uncached I/O via `F_NOCACHE`.
/// Returns `(file, direct_active)` where `direct_active` is `true`
/// only when the `fcntl` call returned 0.
#[cfg(target_os = "macos")]
fn open_direct_journal(path: &Path, _sector_size: u32) -> Result<(File, bool)> {
    use std::os::unix::io::AsRawFd;

    let mut options = OpenOptions::new();
    options.read(true).write(true).create(true).truncate(false);
    let file = match options.open(path) {
        Ok(opened) => opened,
        Err(io_err) => return Err(Error::Io(io_err)),
    };

    // Set F_NOCACHE — macOS's analogue of O_DIRECT.
    // SAFETY: the fd is owned by `file`, which lives across this call.
    let fcntl_rc = unsafe { libc::fcntl(file.as_raw_fd(), libc::F_NOCACHE, 1) };
    let direct_active = fcntl_rc == 0;
    Ok((file, direct_active))
}

/// Windows variant: first tries `CreateFileW` with
/// `FILE_FLAG_NO_BUFFERING | FILE_FLAG_WRITE_THROUGH`; if that fails
/// with error code 87 it falls back to a standard buffered open.
/// Returns `(file, direct_active)`; any other failure is surfaced as
/// `Error::Io` carrying the Win32 error code.
#[cfg(target_os = "windows")]
fn open_direct_journal(path: &Path, _sector_size: u32) -> Result<(File, bool)> {
    use std::os::windows::ffi::OsStrExt;
    use std::os::windows::io::FromRawHandle;
    use windows_sys::Win32::Foundation::{GetLastError, INVALID_HANDLE_VALUE};
    use windows_sys::Win32::Storage::FileSystem::{
        CreateFileW, FILE_FLAG_NO_BUFFERING, FILE_FLAG_WRITE_THROUGH, FILE_GENERIC_READ,
        FILE_GENERIC_WRITE, FILE_SHARE_READ, OPEN_ALWAYS,
    };

    // NUL-terminated UTF-16 form of the path for the wide-char API.
    let wide_path: Vec<u16> = path
        .as_os_str()
        .encode_wide()
        .chain(std::iter::once(0u16))
        .collect();

    let desired_access = FILE_GENERIC_READ | FILE_GENERIC_WRITE;
    let unbuffered_flags = FILE_FLAG_NO_BUFFERING | FILE_FLAG_WRITE_THROUGH;

    // First attempt: unbuffered + write-through.
    // SAFETY: wide_path is a valid wide-encoded path with trailing NUL
    // that outlives the call.
    let raw_handle = unsafe {
        CreateFileW(
            wide_path.as_ptr(),
            desired_access,
            FILE_SHARE_READ,
            std::ptr::null(),
            OPEN_ALWAYS,
            unbuffered_flags,
            std::ptr::null_mut(),
        )
    };
    let opened = raw_handle != INVALID_HANDLE_VALUE && !raw_handle.is_null();
    if opened {
        // SAFETY: raw_handle is a valid HANDLE we just opened.
        let file = unsafe { File::from_raw_handle(raw_handle as _) };
        return Ok((file, true));
    }

    // Must be read right after the failed call, before anything else
    // can overwrite the thread's last-error value.
    // SAFETY: GetLastError is a thread-local Win32 query with no
    // pre-conditions; safe to call from any thread.
    let win32_code = unsafe { GetLastError() };

    // Anything other than ERROR_INVALID_PARAMETER (87) is a real
    // failure and is reported as-is.
    if win32_code != 87 {
        return Err(Error::Io(std::io::Error::from_raw_os_error(
            win32_code as i32,
        )));
    }

    // Some Windows filesystems / network shares reject
    // FILE_FLAG_NO_BUFFERING with code 87. Fall back to a standard
    // buffered open.
    let mut options = OpenOptions::new();
    options.read(true).write(true).create(true).truncate(false);
    let buffered = options.open(path).map_err(Error::Io)?;
    Ok((buffered, false))
}

/// Fallback for platforms without a Direct-IO path here: performs a
/// plain read/write open (create if missing, never truncate) and
/// always reports `direct_active = false`.
#[cfg(not(any(target_os = "linux", target_os = "macos", target_os = "windows")))]
fn open_direct_journal(path: &Path, _sector_size: u32) -> Result<(File, bool)> {
    // No Direct-IO on unknown platforms — fall back silently.
    let mut options = OpenOptions::new();
    options.read(true).write(true).create(true).truncate(false);
    match options.open(path) {
        Ok(buffered) => Ok((buffered, false)),
        Err(io_err) => Err(Error::Io(io_err)),
    }
}

/// Seeds the log buffer for resume from the partially-filled sector
/// that contains `resume_lsn`, so that later flushes rewrite that
/// sector without destroying the records already in it.
///
/// * `buf` — log buffer whose flush position is being set.
/// * `file` — journal file the trailing sector is read from.
/// * `sector_size` — sector size in bytes (must be non-zero).
/// * `resume_lsn` — byte offset at which appending resumes.
///
/// # Errors
///
/// Propagates any failure from reading the trailing sector.
fn rehydrate_log_buffer(
    buf: &LogBuffer,
    file: &File,
    sector_size: u32,
    resume_lsn: u64,
) -> Result<()> {
    let sector = u64::from(sector_size);
    let offset_in_sector = resume_lsn % sector;

    if offset_in_sector == 0 {
        // `resume_lsn` sits exactly on a sector boundary, so there is
        // no partial sector to load — just advance the flush position.
        buf.set_flush_pos_for_resume(resume_lsn, 0, &[]);
    } else {
        // Load the whole sector that holds the partial tail and hand
        // it to the buffer along with how many of its bytes are live.
        let sector_start = resume_lsn - offset_in_sector;
        let sector_bytes = crate::platform::read_range(file, sector_start, sector_size as usize)?;
        buf.set_flush_pos_for_resume(sector_start, offset_in_sector as usize, &sector_bytes);
    }
    Ok(())
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn lsn_default_is_zero() {
        // `Default` must agree with the named zero constant, and the
        // constant itself must be the raw value 0.
        let zero = Lsn::ZERO;
        assert_eq!(Lsn::default(), zero);
        assert_eq!(zero.as_u64(), 0);
    }

    #[test]
    fn lsn_display_format() {
        // `Display` renders the wrapped offset inside `Lsn(...)`.
        let rendered = Lsn(42).to_string();
        assert_eq!(rendered, "Lsn(42)");
    }

    #[test]
    fn lsn_ordering_matches_u64() {
        // Strict ordering follows the wrapped integer...
        for (smaller, larger) in [(100u64, 200u64), (0, 1)] {
            assert!(Lsn(smaller) < Lsn(larger));
        }
        // ...and equal offsets compare equal.
        assert_eq!(Lsn(42), Lsn(42));
    }

    /// Builds a per-test scratch path inside the system temp directory.
    /// The file name combines the process id, the current wall-clock
    /// time in nanoseconds and `tag`. Nothing is created on disk.
    fn tmp_path(tag: &str) -> std::path::PathBuf {
        use std::time::{SystemTime, UNIX_EPOCH};

        let pid = std::process::id();
        let nanos = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap()
            .as_nanos();

        let mut scratch = std::env::temp_dir();
        scratch.push(format!("fsys_journal_test_{pid}_{nanos}_{tag}"));
        scratch
    }

    /// Drop guard that removes the wrapped file when it goes out of
    /// scope, so each test cleans up its scratch journal.
    struct Cleanup(std::path::PathBuf);

    impl Drop for Cleanup {
        fn drop(&mut self) {
            // Best-effort: a failed removal (e.g. the file was never
            // created) is deliberately ignored.
            drop(std::fs::remove_file(self.0.as_path()));
        }
    }

    #[test]
    fn open_creates_new_file_with_zero_lsn() {
        // A journal opened on a fresh path starts with both the append
        // frontier and the durable frontier at zero.
        let journal_path = tmp_path("new");
        let _cleanup = Cleanup(journal_path.clone());
        let journal = JournalHandle::open(&journal_path).expect("open");

        let append_frontier = journal.next_lsn();
        assert_eq!(append_frontier, Lsn::ZERO);
        let durable_frontier = journal.synced_lsn();
        assert_eq!(durable_frontier, Lsn::ZERO);
    }

    #[test]
    fn append_advances_lsn_by_framed_record_length() {
        // Every record is stored inside a 12-byte frame
        // (magic + length + crc32c), so each append moves the LSN
        // forward by payload.len() + 12.
        let journal_path = tmp_path("append");
        let _cleanup = Cleanup(journal_path.clone());
        let journal = JournalHandle::open(&journal_path).expect("open");

        // 5-byte payload -> 17-byte frame.
        let after_first = journal.append(b"hello").expect("append1");
        assert_eq!(after_first, Lsn(17));
        assert_eq!(journal.next_lsn(), Lsn(17));

        // 6-byte payload -> 18-byte frame, ending at 35.
        let after_second = journal.append(b" world").expect("append2");
        assert_eq!(after_second, Lsn(35));
        assert_eq!(journal.next_lsn(), Lsn(35));
    }

    #[test]
    fn append_empty_record_writes_framed_marker() {
        // A zero-length record is legal: it is written as a bare
        // 12-byte header frame (length=0, crc over magic+length).
        // Handy as a checkpoint / transaction-boundary marker in the
        // journal stream.
        let journal_path = tmp_path("empty_record");
        let _cleanup = Cleanup(journal_path.clone());
        let journal = JournalHandle::open(&journal_path).expect("open");

        // Seed with one real record so the marker is not at offset 0.
        let _ = journal.append(b"first").expect("first");
        assert_eq!(journal.next_lsn(), Lsn(17));

        // The marker advances the frontier by exactly the frame overhead.
        let marker_end = journal.append(b"").expect("empty");
        assert_eq!(marker_end, Lsn(29));
        assert_eq!(journal.next_lsn(), Lsn(29));
    }

    // ─────────────────────────────────────────────────────────
    // 0.9.7 M-11 — boundary-condition tests
    //
    // Audit M-11: existing coverage included min/max log-buffer
    // sizes and oversize records; missing the single-byte,
    // exact-sector-size, exact-page-size, and zero-length-batch
    // boundaries. These tests close those specific gaps with
    // buffered-mode journals (deterministic, platform-independent,
    // no Direct-IO alignment requirements).
    // ─────────────────────────────────────────────────────────

    #[test]
    fn append_one_byte_record_produces_13_byte_frame() {
        // The smallest non-empty payload is a single byte, giving a
        // frame of FRAME_OVERHEAD (12) + 1 = 13 bytes. This keeps the
        // 1-byte path distinct from the 0-byte path covered by
        // `append_empty_record_writes_framed_marker`.
        let journal_path = tmp_path("one_byte");
        let _cleanup = Cleanup(journal_path.clone());
        let journal = JournalHandle::open(&journal_path).expect("open");

        let after_x = journal.append(b"x").expect("append").as_u64();
        assert_eq!(after_x, 13, "1-byte payload + 12-byte overhead = 13");
        assert_eq!(journal.next_lsn().as_u64(), 13);

        // A second 1-byte record stacks directly after the first.
        let after_y = journal.append(b"y").expect("append 2").as_u64();
        assert_eq!(after_y, 26, "two 1-byte frames = 26");
        journal.close().expect("close");

        // Read everything back and check both payload bytes survived.
        let mut reader = JournalReader::open(&journal_path).expect("reader");
        let mut recovered: Vec<Vec<u8>> = Vec::new();
        for item in reader.iter() {
            recovered.push(item.expect("record").payload);
        }
        assert_eq!(recovered, vec![b"x".to_vec(), b"y".to_vec()]);
    }

    #[test]
    fn append_frame_at_exact_4kb_boundary_round_trips() {
        // The payload is sized so that payload + frame overhead is
        // exactly 4096 bytes (4 KiB) — a very common page / logical
        // sector size. Off-by-one framing bugs or alignment-sensitive
        // paths that trip at an exact page multiple would show up here.
        const FRAME_TOTAL: usize = 4096;
        let payload_len = FRAME_TOTAL - format::FRAME_OVERHEAD;
        let payload = vec![0xC3u8; payload_len];

        let journal_path = tmp_path("exact_4kb");
        let _cleanup = Cleanup(journal_path.clone());
        let journal = JournalHandle::open(&journal_path).expect("open");
        let end = journal.append(&payload).expect("append");
        assert_eq!(end.as_u64(), 4096, "frame total at exact 4 KiB boundary");
        journal.close().expect("close");

        // The record must decode with the same length and fill byte.
        let mut reader = JournalReader::open(&journal_path).expect("reader");
        let mut records = reader.iter();
        let first = records.next().expect("record present").expect("decode");
        assert_eq!(first.payload.len(), payload_len);
        assert!(
            !first.payload.iter().any(|&byte| byte != 0xC3),
            "payload content drift at 4 KiB boundary"
        );
    }

    #[test]
    fn append_frame_at_exact_16kb_boundary_round_trips() {
        // The payload is sized so that payload + frame overhead is
        // exactly 16384 bytes (16 KiB) — Apple Silicon's native page
        // size and a common large-block boundary. Also guards against
        // 16-bit-arithmetic edge cases in the framing code.
        const FRAME_TOTAL: usize = 16384;
        let payload_len = FRAME_TOTAL - format::FRAME_OVERHEAD;
        let payload = vec![0x5Au8; payload_len];

        let journal_path = tmp_path("exact_16kb");
        let _cleanup = Cleanup(journal_path.clone());
        let journal = JournalHandle::open(&journal_path).expect("open");
        let end = journal.append(&payload).expect("append");
        assert_eq!(end.as_u64(), 16384, "frame total at exact 16 KiB boundary");
        journal.close().expect("close");

        // Read the single record back and verify length + fill byte.
        let mut reader = JournalReader::open(&journal_path).expect("reader");
        let mut records = reader.iter();
        let first = records.next().expect("record present").expect("decode");
        assert_eq!(first.payload.len(), payload_len);
        assert!(!first.payload.iter().any(|&byte| byte != 0x5A));
    }

    #[test]
    fn append_frame_at_exact_64kb_boundary_round_trips() {
        // The payload is sized so that payload + frame overhead is
        // exactly 64 KiB — the default Direct-mode log-buffer slot
        // size and a common storage alignment unit. Running it in
        // buffered mode still exercises the framing layer without
        // pulling in Direct-IO requirements.
        const FRAME_TOTAL: usize = 65536;
        let payload_len = FRAME_TOTAL - format::FRAME_OVERHEAD;
        let payload = vec![0xA5u8; payload_len];

        let journal_path = tmp_path("exact_64kb");
        let _cleanup = Cleanup(journal_path.clone());
        let journal = JournalHandle::open(&journal_path).expect("open");
        let end = journal.append(&payload).expect("append");
        assert_eq!(end.as_u64(), 65536, "frame total at exact 64 KiB boundary");
        journal.close().expect("close");

        // Read the single record back and verify length + fill byte.
        let mut reader = JournalReader::open(&journal_path).expect("reader");
        let mut records = reader.iter();
        let first = records.next().expect("record present").expect("decode");
        assert_eq!(first.payload.len(), payload_len);
        assert!(!first.payload.iter().any(|&byte| byte != 0xA5));
    }

    #[test]
    fn append_batch_single_one_byte_record_round_trips() {
        // One 1-byte record pushed through the batch API. This differs
        // from `append_batch_single_record_matches_append` (multi-byte
        // payload) and from
        // `append_one_byte_record_produces_13_byte_frame` (single
        // `append` path): it drives the batch path with the smallest
        // possible non-empty record.
        let journal_path = tmp_path("batch_one_byte");
        let _cleanup = Cleanup(journal_path.clone());
        let journal = JournalHandle::open(&journal_path).expect("open");

        let only_record: &[u8] = b"\x42";
        let end = journal.append_batch(&[only_record]).expect("append_batch");
        assert_eq!(end.as_u64(), 13);
        journal.close().expect("close");

        let mut reader = JournalReader::open(&journal_path).expect("reader");
        let first = reader.iter().next().expect("present").expect("decode");
        assert_eq!(first.payload, vec![0x42u8]);
        // A fresh iterator over the same reader must now be exhausted.
        let leftover = reader.iter().next();
        assert!(leftover.is_none(), "exactly one record");
    }

    #[test]
    fn append_batch_only_empty_records_round_trips() {
        // A batch made solely of empty records: each one becomes a
        // 12-byte header-only frame, so the LSN advances by N * 12.
        // Guards against any "records must carry payload" assumption
        // in the batch encoder.
        let journal_path = tmp_path("batch_empty_records");
        let _cleanup = Cleanup(journal_path.clone());
        let journal = JournalHandle::open(&journal_path).expect("open");

        let empties: [&[u8]; 4] = [b"", b"", b"", b""];
        let end = journal.append_batch(&empties).expect("append_batch");
        assert_eq!(end.as_u64(), 4 * format::FRAME_OVERHEAD as u64);
        journal.close().expect("close");

        // All four records must come back, each with no payload.
        let mut reader = JournalReader::open(&journal_path).expect("reader");
        let mut recovered: Vec<Vec<u8>> = Vec::new();
        for item in reader.iter() {
            recovered.push(item.expect("decode").payload);
        }
        assert_eq!(recovered.len(), 4, "all four empty records present");
        recovered
            .iter()
            .enumerate()
            .for_each(|(i, p)| assert!(p.is_empty(), "record {i} should be empty"));
    }

    #[test]
    fn append_batch_mixed_empty_and_small_records_round_trips() {
        // A heterogeneous batch — [empty, 1-byte, empty, 1-byte] —
        // puts the empty-record boundary in the middle of a batch.
        // Every record gets its own frame and the returned LSN is the
        // sum of all frame sizes.
        let journal_path = tmp_path("batch_mixed_empty");
        let _cleanup = Cleanup(journal_path.clone());
        let journal = JournalHandle::open(&journal_path).expect("open");

        let mixed: [&[u8]; 4] = [b"", b"a", b"", b"b"];
        let end = journal.append_batch(&mixed).expect("append_batch");

        // Frame sizes: 12 + 13 + 12 + 13 = 50.
        let expected_total: u64 = mixed
            .iter()
            .map(|record| (record.len() + format::FRAME_OVERHEAD) as u64)
            .sum();
        assert_eq!(end.as_u64(), expected_total);
        journal.close().expect("close");

        let mut reader = JournalReader::open(&journal_path).expect("reader");
        let mut recovered: Vec<Vec<u8>> = Vec::new();
        for item in reader.iter() {
            recovered.push(item.expect("decode").payload);
        }
        let expected: Vec<Vec<u8>> = vec![Vec::new(), b"a".to_vec(), Vec::new(), b"b".to_vec()];
        assert_eq!(recovered, expected);
    }

    #[test]
    fn sync_through_zero_is_noop() {
        // Syncing through LSN 0 on an empty journal must succeed and
        // leave the durable frontier at zero.
        let journal_path = tmp_path("sync_zero");
        let _cleanup = Cleanup(journal_path.clone());
        let journal = JournalHandle::open(&journal_path).expect("open");

        journal.sync_through(Lsn::ZERO).expect("sync_through(0)");

        let durable = journal.synced_lsn();
        assert_eq!(durable, Lsn::ZERO);
    }

    #[test]
    fn sync_through_advances_synced_lsn() {
        // After syncing through an appended record's end LSN, the
        // durable frontier must be at or beyond that LSN.
        let journal_path = tmp_path("sync_advance");
        let _cleanup = Cleanup(journal_path.clone());
        let journal = JournalHandle::open(&journal_path).expect("open");

        let target = journal.append(b"durable").expect("append");
        journal.sync_through(target).expect("sync");

        let durable = journal.synced_lsn();
        assert!(durable >= target);
    }

    #[test]
    fn append_then_reopen_resumes_at_existing_size() {
        // Write one record, close, reopen: both frontiers must resume
        // at the end of the existing data.
        let journal_path = tmp_path("resume");
        let _cleanup = Cleanup(journal_path.clone());

        let first_session = JournalHandle::open(&journal_path).expect("open1");
        let _ = first_session.append(b"persist this").expect("append");
        first_session.close().expect("close");

        let reopened = JournalHandle::open(&journal_path).expect("reopen");
        // 12-byte payload + 12-byte frame overhead = 24 bytes.
        let resumed_at = Lsn(24);
        assert_eq!(reopened.next_lsn(), resumed_at);
        assert_eq!(reopened.synced_lsn(), resumed_at);
    }

    #[test]
    fn append_writes_framed_records_to_file() {
        // Checks the on-disk layout: the file length equals the sum of
        // (payload + FRAME_OVERHEAD) over all records, and each frame
        // decodes back to the payload that was appended.
        let journal_path = tmp_path("readback");
        let _cleanup = Cleanup(journal_path.clone());
        let journal = JournalHandle::open(&journal_path).expect("open");
        let _ = journal.append(b"alpha").expect("a1");
        let _ = journal.append(b"beta").expect("a2");
        journal.close().expect("close");
        let bytes = std::fs::read(&journal_path).expect("read");

        // Total: (5 + 12) + (4 + 12) = 33 bytes.
        assert_eq!(bytes.len(), 33);

        // Frame 1 starts at offset 0.
        let (used_1, start_1, end_1) = match format::decode_frame(&bytes) {
            format::FrameDecode::Ok {
                consumed,
                payload_start,
                payload_end,
            } => (consumed, payload_start, payload_end),
            other => panic!("frame 1 decode failed: {other:?}"),
        };
        assert_eq!(used_1, 17);
        assert_eq!(&bytes[start_1..end_1], b"alpha");

        // Frame 2 starts right after frame 1, at offset 17; its payload
        // indices are relative to that offset.
        let (used_2, start_2, end_2) = match format::decode_frame(&bytes[17..]) {
            format::FrameDecode::Ok {
                consumed,
                payload_start,
                payload_end,
            } => (consumed, payload_start, payload_end),
            other => panic!("frame 2 decode failed: {other:?}"),
        };
        assert_eq!(used_2, 16);
        assert_eq!(&bytes[17 + start_2..17 + end_2], b"beta");
    }

    // ─────────────────────────────────────────────────────────
    // 0.9.1 — `append_batch` coverage
    // ─────────────────────────────────────────────────────────

    #[test]
    fn append_batch_empty_is_noop() {
        let journal_path = tmp_path("batch_empty");
        let _cleanup = Cleanup(journal_path.clone());
        let journal = JournalHandle::open(&journal_path).expect("open");

        // On a fresh journal an empty batch writes nothing and reports
        // the current next-write position, which is zero.
        let on_fresh = journal.append_batch(&[]).expect("append_batch empty");
        assert_eq!(on_fresh, Lsn::ZERO);
        assert_eq!(journal.next_lsn(), Lsn::ZERO);

        // Once something has been appended, an empty batch reports the
        // moved frontier rather than zero — i.e. it reads `next_lsn`.
        let _ = journal.append(b"first").expect("append");
        let after_append = journal.append_batch(&[]).expect("append_batch empty 2");
        assert_eq!(after_append, journal.next_lsn());
    }

    #[test]
    fn append_batch_single_record_matches_append() {
        // Parity check: a batch holding one record must yield the same
        // LSN and the same on-disk bytes as a plain `append` of that
        // record.
        let record: &[u8] = b"singleton";

        // Journal A: written through the single-record path.
        let single_path = tmp_path("batch_single_a");
        let _cleanup_single = Cleanup(single_path.clone());
        let single = JournalHandle::open(&single_path).expect("open a");
        let single_lsn = single.append(b"singleton").expect("append");
        single.close().expect("close a");
        let single_bytes = std::fs::read(&single_path).expect("read a");

        // Journal B: written through the batch path.
        let batch_path = tmp_path("batch_single_b");
        let _cleanup_batch = Cleanup(batch_path.clone());
        let batched = JournalHandle::open(&batch_path).expect("open b");
        let batch_lsn = batched.append_batch(&[record]).expect("append_batch");
        batched.close().expect("close b");
        let batch_bytes = std::fs::read(&batch_path).expect("read b");

        assert_eq!(single_lsn, batch_lsn);
        assert_eq!(single_bytes, batch_bytes);
    }

    #[test]
    fn append_batch_multi_record_parity_with_append_loop() {
        // Key parity test for the bulk-load path: `append_batch` must
        // write byte-for-byte what N separate `append` calls write,
        // and hand back the same final LSN.
        let payloads: Vec<&[u8]> = vec![b"alpha", b"beta", b"", b"gamma!", b"delta-payload-x"];

        // Reference journal: one `append` per payload, keeping the
        // LSN returned by the last call.
        let loop_path = tmp_path("batch_parity_loop");
        let _cleanup_loop = Cleanup(loop_path.clone());
        let looped = JournalHandle::open(&loop_path).expect("open loop");
        let loop_end = payloads
            .iter()
            .fold(Lsn::ZERO, |_, p| looped.append(p).expect("append"));
        looped.close().expect("close loop");
        let loop_bytes = std::fs::read(&loop_path).expect("read loop");

        // Journal under test: the same payloads in a single batch.
        let batch_path = tmp_path("batch_parity_batch");
        let _cleanup_batch = Cleanup(batch_path.clone());
        let batched = JournalHandle::open(&batch_path).expect("open batch");
        let batch_end = batched.append_batch(&payloads).expect("append_batch");
        batched.close().expect("close batch");
        let batch_bytes = std::fs::read(&batch_path).expect("read batch");

        assert_eq!(loop_end, batch_end);
        assert_eq!(loop_bytes, batch_bytes);
    }

    #[test]
    fn append_batch_returns_end_lsn_of_last_record() {
        let journal_path = tmp_path("batch_lsn");
        let _cleanup = Cleanup(journal_path.clone());
        let journal = JournalHandle::open(&journal_path).expect("open");

        // Payload sizes 5, 4, 7 give frame sizes 17, 16, 19, so the
        // running end offsets are 17, 33, 52. The batch must report
        // the last of those.
        let batch: [&[u8]; 3] = [b"hello", b"abcd", b"journal"];
        let end = journal.append_batch(&batch).expect("append_batch");

        let expected_end = Lsn(52);
        assert_eq!(end, expected_end);
        assert_eq!(journal.next_lsn(), expected_end);
    }

    #[test]
    fn append_batch_decodes_back_via_journal_reader() {
        // Full round trip: write one batch, close the journal, then
        // decode it with JournalReader and check that every record
        // comes back in submission order with its original bytes.
        let journal_path = tmp_path("batch_readback");
        let _cleanup = Cleanup(journal_path.clone());
        let journal = JournalHandle::open(&journal_path).expect("open");

        // 32 records of 64..=80 bytes, each filled with a pattern
        // derived from the record index and the byte position.
        let originals: Vec<Vec<u8>> = (0..32usize)
            .map(|i| {
                (0..64 + (i % 17))
                    .map(|k| ((i.wrapping_mul(31).wrapping_add(k)) & 0xFF) as u8)
                    .collect()
            })
            .collect();
        let borrowed: Vec<&[u8]> = originals.iter().map(Vec::as_slice).collect();
        let _ = journal.append_batch(&borrowed).expect("append_batch");
        journal.close().expect("close");

        let mut reader = JournalReader::open(&journal_path).expect("reader open");
        let mut recovered: Vec<Vec<u8>> = Vec::new();
        for item in reader.iter() {
            recovered.push(item.expect("decode").payload);
        }
        assert_eq!(recovered, originals);
    }

    #[test]
    fn append_batch_oversize_record_rejected() {
        // Smoke test only — despite the name, nothing is rejected
        // here. Building a record above FRAME_MAX_PAYLOAD would need
        // an allocation of roughly 256 MiB, so this test does not
        // construct one. The oversize rejection itself is exercised
        // by the `format` module's `length_overflow_rejected_at_encode`
        // test.
        //
        // What this test does verify: a normal, in-range single-record
        // batch passes `append_batch`'s validation, succeeds, and
        // advances the frontier by exactly one frame.
        let path = tmp_path("batch_oversize_smoke");
        let _g = Cleanup(path.clone());
        let j = JournalHandle::open(&path).expect("open");
        let p: &[u8] = b"normal";
        let lsn = j.append_batch(&[p]).expect("append_batch");
        // 6-byte payload + 12-byte frame overhead = 18.
        assert_eq!(lsn, Lsn(6 + 12));
        assert_eq!(j.next_lsn(), Lsn(18));
    }

    #[test]
    fn append_batch_concurrent_appenders_serialise_via_atomic() {
        // Two threads call append_batch at the same time. Whatever
        // order the two batches land in, each batch's own records must
        // stay contiguous on disk (the LSN-reservation invariant).
        // The check: JournalReader decodes exactly the expected number
        // of records, and each batch shows up as an in-order run
        // starting at the first occurrence of its first record.
        use std::sync::Arc;
        let path = tmp_path("batch_concurrent");
        let _g = Cleanup(path.clone());
        let j = Arc::new(JournalHandle::open(&path).expect("open"));

        let p_a: Vec<Vec<u8>> = (0..8).map(|i| format!("a{i:02}").into_bytes()).collect();
        let p_b: Vec<Vec<u8>> = (0..8).map(|i| format!("b{i:02}").into_bytes()).collect();

        // Spawns one appender thread that submits `records` as a
        // single batch and returns the batch's end LSN.
        let spawn_batch = |records: Vec<Vec<u8>>, label: &'static str| {
            let journal = Arc::clone(&j);
            std::thread::spawn(move || {
                let refs: Vec<&[u8]> = records.iter().map(Vec::as_slice).collect();
                journal.append_batch(&refs).expect(label)
            })
        };
        let worker_a = spawn_batch(p_a.clone(), "batch a");
        let worker_b = spawn_batch(p_b.clone(), "batch b");
        let _ = worker_a.join().expect("join a");
        let _ = worker_b.join().expect("join b");

        j.sync_through(j.next_lsn()).expect("sync");
        drop(j);

        // 16 records expected.
        let mut reader = JournalReader::open(&path).expect("reader");
        let mut decoded: Vec<Vec<u8>> = Vec::new();
        for item in reader.iter() {
            decoded.push(item.expect("decode").payload);
        }
        assert_eq!(decoded.len(), 16);

        // Locate each batch's run and confirm its records follow in
        // submission order.
        for (missing_msg, batch) in [("first a in stream", &p_a), ("first b in stream", &p_b)] {
            let run_start = decoded
                .iter()
                .position(|r| r == &batch[0])
                .expect(missing_msg);
            for (k, original) in batch.iter().enumerate() {
                assert_eq!(&decoded[run_start + k], original);
            }
        }
    }

    #[test]
    fn append_batch_resume_after_close_reopen() {
        // Records written via a batch must survive close + reopen:
        // `next_lsn` resumes at the end of the batch and further
        // appends continue from there.
        let journal_path = tmp_path("batch_resume");
        let _cleanup = Cleanup(journal_path.clone());

        let first_session = JournalHandle::open(&journal_path).expect("open1");
        let batch: &[&[u8]] = &[b"persist1", b"persist2", b"persist3"];
        let _ = first_session.append_batch(batch).expect("batch");
        first_session.close().expect("close");

        let reopened = JournalHandle::open(&journal_path).expect("reopen");
        // Three 8-byte payloads, each with 12 bytes of frame overhead:
        // (8 + 12) * 3 = 60.
        assert_eq!(reopened.next_lsn(), Lsn(60));

        // A 5-byte record after the batch ends at 60 + 5 + 12 = 77.
        let after_resume = reopened.append(b"after").expect("append after batch");
        assert_eq!(after_resume, Lsn(77));
    }

    // ─────────────────────────────────────────────────────────
    // 0.9.1 — leader/follower group-commit coverage
    // ─────────────────────────────────────────────────────────

    #[test]
    fn group_commit_window_none_disables_batching() {
        // With group_commit_window = None, sync_through is expected to
        // fsync straight away instead of waiting for followers. The
        // absence of a wait cannot be measured deterministically, so
        // this only confirms the configuration works end to end:
        // sync_through succeeds and synced_lsn covers the target.
        let journal_path = tmp_path("gc_window_none");
        let _cleanup = Cleanup(journal_path.clone());

        let no_window = JournalOptions::new().group_commit_window(None);
        let journal = JournalHandle::open_with_options(&journal_path, no_window).expect("open");

        let target = journal.append(b"x").expect("append");
        journal.sync_through(target).expect("sync");

        let durable = journal.synced_lsn();
        assert!(durable >= target);
    }

    #[test]
    fn group_commit_window_some_succeeds() {
        // Counterpart of the `None` case: with a 100 µs window
        // and a max batch of 4 configured, a single caller's
        // sync_through must still return Ok and leave the
        // durable frontier at or past its target.
        let window = Duration::from_micros(100);
        let path = tmp_path("gc_window_some");
        let _g = Cleanup(path.clone());
        let journal = JournalHandle::open_with_options(
            &path,
            JournalOptions::new()
                .group_commit_window(Some(window))
                .group_commit_max_batch(4),
        )
        .expect("open");

        let target = journal.append(b"y").expect("append");
        journal.sync_through(target).expect("sync");

        let durable = journal.synced_lsn();
        assert!(durable >= target);
    }

    #[test]
    fn group_commit_follower_promoted_when_target_above_leader_frontier() {
        // Contention test aimed at the follower-promotion loop
        // in sync_through: 16 threads each run 8 rounds of
        // append + sync against one shared journal (max batch
        // 4). Some callers are expected to arrive while another
        // sync is in flight and to need a later cycle to cover
        // their target. Whatever path a call takes, the durable
        // frontier must cover its LSN once it returns.
        use std::sync::Arc;

        let path = tmp_path("gc_follower_promote");
        let _g = Cleanup(path.clone());
        let journal = Arc::new(
            JournalHandle::open_with_options(
                &path,
                JournalOptions::new().group_commit_max_batch(4),
            )
            .expect("open"),
        );

        let workers: Vec<_> = (0..16u32)
            .map(|thread_id| {
                let journal = Arc::clone(&journal);
                std::thread::spawn(move || {
                    for round in 0..8u32 {
                        let payload = format!("t{thread_id:02}r{round:02}");
                        let target = journal.append(payload.as_bytes()).expect("append");
                        journal.sync_through(target).expect("sync");
                        assert!(journal.synced_lsn() >= target);
                    }
                })
            })
            .collect();

        // A panic inside any worker surfaces here as a join
        // error and fails the test.
        for worker in workers {
            worker.join().expect("join");
        }
    }

    #[test]
    fn group_commit_idempotent_when_already_synced() {
        // Once a target is durable, further sync_through calls
        // at or below it are expected to take the fast path (no
        // fsync, no leadership, nothing beyond the atomic
        // load). The absence of an fsync is not observable from
        // here, so the test only pins that such calls return Ok
        // and the frontier stays at or above the target.
        let path = tmp_path("gc_idempotent");
        let _g = Cleanup(path.clone());
        let journal = JournalHandle::open(&path).expect("open");

        let target = journal.append(b"sync me").expect("append");
        journal.sync_through(target).expect("first sync");

        // Repeat the same target, then Lsn::ZERO, which is
        // below every real record and so always already covered.
        for (already_durable, what) in [(target, "second sync"), (Lsn::ZERO, "zero sync")] {
            journal.sync_through(already_durable).expect(what);
        }

        assert!(journal.synced_lsn() >= target);
    }

    /// 0.9.7 H-16 follow-up — wake-stampede stress test.
    ///
    /// Spawns 64 threads that all race to sync the same record;
    /// at most one of them is expected to become leader while
    /// the rest follow. Validates:
    ///
    /// 1. **No deadlock** — every thread must report completion
    ///    within `BUDGET`. Completion is signalled over a
    ///    channel that the test polls with a deadline, so a
    ///    missed-wakeup regression fails the test instead of
    ///    hanging the runner inside an unbounded `join`. Under
    ///    the pre-0.9.7 lock-protected `pending_followers`
    ///    design, a 64-thread stampede serialised through the
    ///    state mutex on every cycle.
    /// 2. **All followers see their target as durable.** Every
    ///    thread's `sync_through` returns `Ok(())` and
    ///    `synced_lsn() >= their_lsn` after return.
    /// 3. **`pending_followers` returns to zero.** After all
    ///    threads join, the atomic counter is back at 0 —
    ///    no follower leaked the counter on an early-exit path.
    ///
    /// The test does NOT assert absolute wall-clock timing
    /// (CI variance defeats µs-level claims). The audit's
    /// estimated 0.5-2 µs/follower win is documented in the
    /// commit message; this test validates **structural
    /// correctness** of the atomic-decrement + lock-free
    /// early-exit path under the contention level the audit
    /// flagged.
    ///
    /// 64 followers (rather than the audit's exact 100+
    /// number) is chosen to keep this test viable on shared
    /// CI runners where parallel cargo-test execution can
    /// starve a 100+ thread spawn. The stampede semantics —
    /// `notify_all` waking N parked threads that all serialise
    /// through the state mutex — kick in at any N > 1; 64 is
    /// well within the regime the audit was concerned about.
    #[test]
    fn group_commit_wake_stampede_64_followers() {
        use std::sync::mpsc::{self, RecvTimeoutError};
        use std::sync::{Arc, Barrier};

        const FOLLOWER_COUNT: usize = 64;
        const BUDGET: Duration = Duration::from_secs(60);

        let path = tmp_path("gc_wake_stampede_64");
        let _g = Cleanup(path.clone());
        // Small window so the leader doesn't burn the whole
        // 60s budget waiting for follower batching — the
        // stampede happens on the wake side regardless of
        // window size.
        let opts = JournalOptions::new()
            .group_commit_window(Some(Duration::from_micros(100)))
            .group_commit_max_batch(FOLLOWER_COUNT as u32);
        let j = Arc::new(JournalHandle::open_with_options(&path, opts).expect("open"));

        // One appender writes a single record; every thread
        // syncs through that record's LSN. This is the
        // worst-case stampede pattern: N threads concurrently
        // race to sync the same target, exactly one becomes
        // leader, N-1 park on `cv_followers`, then notify_all
        // wakes them en masse.
        let target_lsn = j.append(b"stampede-target-record").expect("append");

        // `Barrier` synchronises all FOLLOWER_COUNT threads at
        // the gate without spin-loops — spinning would burn CPU
        // cores and starve other concurrently-running tests
        // (cargo test runs the lib-test binary's tests in
        // parallel, so a spin-gate here doubles as a denial-of-
        // service against the rest of the suite).
        let gate = Arc::new(Barrier::new(FOLLOWER_COUNT));
        // Each thread sends one unit on this channel as its
        // very last action. A thread that panics drops its
        // sender without sending; a thread that is stuck keeps
        // its sender alive and never sends.
        let (done_tx, done_rx) = mpsc::channel::<()>();
        let start = Instant::now();
        let mut handles = Vec::with_capacity(FOLLOWER_COUNT);
        for _ in 0..FOLLOWER_COUNT {
            let j = j.clone();
            let gate = gate.clone();
            let done_tx = done_tx.clone();
            handles.push(std::thread::spawn(move || {
                // Park at the barrier until every thread is
                // ready, maximising the chance that they all
                // hit `sync_through` concurrently.
                let _ = gate.wait();
                j.sync_through(target_lsn).expect("follower sync");
                // Verify the contract: post-return, our target
                // must be at-or-below the durable frontier.
                assert!(
                    j.synced_lsn() >= target_lsn,
                    "follower returned but durable frontier still below target",
                );
                // The receiver outlives every thread on the
                // success path; a send error can only mean the
                // test body already failed, so it is ignored.
                let _ = done_tx.send(());
            }));
        }
        // Drop the original sender so the channel disconnects
        // once every thread has either finished or panicked.
        drop(done_tx);

        // Watchdog: wait for FOLLOWER_COUNT completion signals,
        // but never longer than BUDGET in total.
        for _ in 0..FOLLOWER_COUNT {
            let remaining = BUDGET.saturating_sub(start.elapsed());
            let signal = done_rx.recv_timeout(remaining);
            assert!(
                !matches!(signal, Err(RecvTimeoutError::Timeout)),
                "wake-stampede 64-follower test still has unfinished threads after \
                 {BUDGET:?} — possible missed-wakeup regression",
            );
            if signal.is_err() {
                // Disconnected: every sender is gone but fewer
                // than FOLLOWER_COUNT signals arrived, so some
                // thread panicked. The joins below report it.
                break;
            }
        }
        // All threads have terminated by now, so these joins
        // return promptly and surface any follower panic.
        for h in handles {
            h.join().expect("follower thread join");
        }
        let elapsed = start.elapsed();
        assert!(
            elapsed < BUDGET,
            "wake-stampede 64-follower test exceeded {BUDGET:?} budget: {elapsed:?} — \
             possible missed-wakeup regression",
        );
        // `pending_followers` must be back at zero after every
        // follower joined. A leaked counter would surface a bug
        // in either the increment or the atomic-decrement path.
        assert_eq!(
            j.group_commit.pending_followers.load(Ordering::Acquire),
            0,
            "pending_followers leaked a count — increment/decrement asymmetry",
        );
        assert!(j.synced_lsn() >= target_lsn);
    }

    #[test]
    fn group_commit_batching_with_8_threads() {
        // Mirror of the v0.8.5 emdb group-commit harness: 8
        // producer threads, each writing 50 records and calling
        // sync_through after every append, with the
        // v0.8.5-default tuning (window = 500 µs,
        // max_batch = 8). Every call must observe its target as
        // durable on return, and the whole run must complete
        // within a reasonable wall-time budget.
        use std::sync::Arc;
        let path = tmp_path("gc_eight_threads");
        let _g = Cleanup(path.clone());
        let opts = JournalOptions::new()
            .group_commit_window(Some(Duration::from_micros(500)))
            .group_commit_max_batch(8);
        let j = Arc::new(JournalHandle::open_with_options(&path, opts).expect("open"));

        let start = Instant::now();
        let mut handles = Vec::new();
        for thread_id in 0..8u32 {
            let j = j.clone();
            handles.push(std::thread::spawn(move || {
                for record in 0..50u32 {
                    let payload = format!("th{thread_id:02}rc{record:04}");
                    let lsn = j.append(payload.as_bytes()).expect("append");
                    j.sync_through(lsn).expect("sync");
                    // Check the durability claim per call, the
                    // same way the follower-promotion test
                    // does, rather than relying only on the
                    // weak end-of-test frontier check below.
                    assert!(
                        j.synced_lsn() >= lsn,
                        "sync_through returned Ok but durable frontier is below its target",
                    );
                }
            }));
        }
        // A failed assertion inside a producer surfaces here as
        // a join error.
        for h in handles {
            h.join().expect("join");
        }
        // Sanity ceiling — even on a slow CI box this should
        // finish in well under 30 s. The test is here for
        // correctness (no deadlocks, no missed wakeups), not
        // for timing assertion strength.
        let elapsed = start.elapsed();
        assert!(
            elapsed < Duration::from_secs(30),
            "group-commit harness exceeded 30s wall budget: {elapsed:?}"
        );
        assert!(j.synced_lsn().0 > 0);
    }

    /// 0.9.1 sanity bench: checks that one `append_batch` call
    /// beats the equivalent tight loop of `append` calls by a
    /// meaningful margin. Run on demand with:
    ///
    /// ```sh
    /// cargo test --release --lib --features stress -- \
    ///     --ignored append_batch_sanity_bench --nocapture
    /// ```
    ///
    /// Marked `#[ignore]` because micro-timings depend heavily
    /// on the host filesystem and must not gate CI; this is a
    /// manual verification harness, not a regression assertion.
    #[test]
    #[ignore = "manual sanity bench; run with --ignored --nocapture"]
    #[allow(clippy::print_stdout)] // bench test prints timings on `--nocapture`
    fn append_batch_sanity_bench() {
        const N: usize = 10_000;
        const PAYLOAD_LEN: usize = 150;
        const ROUNDS: usize = 5;

        let payload = vec![0xABu8; PAYLOAD_LEN];

        // Best-of-ROUNDS: keep the fastest run of each path to
        // suppress single-run page-cache jitter. The original
        // rationale: the Windows NTFS write-back cache makes
        // repeated tight syscalls cheaper than one large write
        // at the ~megabyte scale, so the loop path looks
        // artificially fast on a hot page cache; on Linux +
        // bare-metal NVMe the wins are larger and more
        // consistent. The assertion gates the Windows-friendly
        // floor.
        let mut best_loop = Duration::from_secs(u64::MAX);
        let mut best_batch = Duration::from_secs(u64::MAX);
        for round in 0..ROUNDS {
            // Path 1: N individual appends, then one sync.
            let loop_file = tmp_path(&format!("bench_loop_{round}"));
            let _loop_cleanup = Cleanup(loop_file.clone());
            let looped = JournalHandle::open(&loop_file).expect("open");
            let loop_started = Instant::now();
            for _ in 0..N {
                let _ = looped.append(&payload).expect("append");
            }
            looped.sync_through(looped.next_lsn()).expect("sync");
            best_loop = best_loop.min(loop_started.elapsed());
            // Close the handle before the batch path runs; the
            // cleanup guard stays alive until the round ends.
            drop(looped);

            // Path 2: the same N payloads in a single
            // append_batch call, then one sync. The slice of
            // payload references is built outside the timed
            // region.
            let batch_file = tmp_path(&format!("bench_batch_{round}"));
            let _batch_cleanup = Cleanup(batch_file.clone());
            let batched = JournalHandle::open(&batch_file).expect("open");
            let refs: Vec<&[u8]> = vec![payload.as_slice(); N];
            let batch_started = Instant::now();
            let _ = batched.append_batch(&refs).expect("batch");
            batched.sync_through(batched.next_lsn()).expect("sync");
            best_batch = best_batch.min(batch_started.elapsed());
            drop(batched);
        }

        let speedup = best_loop.as_secs_f64() / best_batch.as_secs_f64();
        println!("append loop  ({N}× {PAYLOAD_LEN} B), best of {ROUNDS}: {best_loop:?}");
        println!("append_batch ({N}× {PAYLOAD_LEN} B), best of {ROUNDS}: {best_batch:?}");
        println!("speedup: {speedup:.2}×");
        // The threshold is the Windows-page-cache floor; on
        // Linux + NVMe expect 3–10× depending on payload size
        // and concurrent appender count.
        assert!(
            speedup > 1.2,
            "append_batch must be at least 1.2× faster than append-loop; got {speedup:.2}×"
        );
    }

    // ─────────────────────────────────────────────────────────
    // 0.9.2 — FsysObserver integration coverage
    // ─────────────────────────────────────────────────────────

    #[test]
    fn observer_fires_on_append_and_sync() {
        // End-to-end check that a journal obtained from a
        // builder configured with an observer reports its
        // append and sync activity through the FsysObserver
        // callbacks.
        use crate::observer::{FsysObserver, JournalAppendEvent, JournalSyncEvent};
        use std::sync::atomic::AtomicU64;
        use std::sync::Arc;

        /// Observer that accumulates every event it receives
        /// into atomic counters the test reads afterwards.
        #[derive(Debug, Default)]
        struct EventTally {
            appends_seen: AtomicU64,
            records_seen: AtomicU64,
            bytes_seen: AtomicU64,
            syncs_seen: AtomicU64,
            durable_lsn_seen: AtomicU64,
        }
        impl FsysObserver for EventTally {
            fn on_journal_append(&self, e: JournalAppendEvent) {
                let _ = self.appends_seen.fetch_add(1, Ordering::Relaxed);
                let _ = self
                    .records_seen
                    .fetch_add(u64::from(e.records), Ordering::Relaxed);
                let _ = self.bytes_seen.fetch_add(e.bytes_written, Ordering::Relaxed);
            }
            fn on_journal_sync(&self, e: JournalSyncEvent) {
                let _ = self.syncs_seen.fetch_add(1, Ordering::Relaxed);
                self.durable_lsn_seen.store(e.durable_lsn, Ordering::Relaxed);
            }
        }

        let path = tmp_path("observer_e2e");
        let _g = Cleanup(path.clone());
        let tally = Arc::new(EventTally::default());
        // The builder takes a trait object; the test keeps the
        // concrete Arc so it can read the counters back.
        let as_observer: Arc<dyn FsysObserver> = tally.clone();
        let fs = crate::builder().observer(as_observer).build().expect("build");
        let log = fs.journal(&path).expect("journal");

        // Workload: three single appends, then a three-record
        // batch, then one sync through the batch's LSN.
        let _ = log.append(b"alpha").expect("a1");
        let _ = log.append(b"beta").expect("a2");
        let _ = log.append(b"gamma").expect("a3");
        let batch: [&[u8]; 3] = [b"delta", b"epsilon", b"zeta"];
        let last = log.append_batch(&batch).expect("batch");
        log.sync_through(last).expect("sync");

        let read = |counter: &AtomicU64| counter.load(Ordering::Relaxed);

        // One event per append call (3 singles + 1 batch), with
        // the batch event carrying 3 records: 4 events, 6
        // records in total.
        assert_eq!(read(&tally.appends_seen), 4);
        assert_eq!(read(&tally.records_seen), 6);
        // Bytes include 12 bytes of framing per record:
        //   singles: (5 + 4 + 5) + 3 * 12 = 50
        //   batch:   (5 + 7 + 4) + 3 * 12 = 52
        //   total:   102
        assert_eq!(read(&tally.bytes_seen), 102);

        // Exactly one sync event, reporting the batch's LSN as
        // the durable frontier.
        assert_eq!(read(&tally.syncs_seen), 1);
        assert_eq!(read(&tally.durable_lsn_seen), last.0);
    }

    #[test]
    fn observer_no_op_when_handle_built_without_observer() {
        // Baseline for the observer feature: a handle opened
        // directly through `JournalHandle::open` carries no
        // observer, and single appends, batch appends and sync
        // must all keep working on it. This keeps pre-0.9.2
        // coverage green.
        let path = tmp_path("observer_absent");
        let _g = Cleanup(path.clone());
        let journal = JournalHandle::open(&path).expect("open");
        assert!(journal.observer.is_none());

        let _ = journal.append(b"unobserved").expect("append");
        let batch: [&[u8]; 3] = [b"a", b"b", b"c"];
        let _ = journal.append_batch(&batch).expect("batch");

        let frontier = journal.next_lsn();
        journal.sync_through(frontier).expect("sync");
    }

    // ─────────────────────────────────────────────────────────
    // 0.9.4 — SyncMode + WriteLifetimeHint integration
    // ─────────────────────────────────────────────────────────

    #[test]
    fn sync_mode_full_round_trips_through_journal() {
        // SyncMode::Full is the default and, per the 0.9.4
        // notes, maps to file.sync_data() — the pre-0.9.4 path.
        // Selecting it explicitly must leave behaviour
        // unchanged: append, sync_through, and the durable
        // frontier advances to cover the record.
        let path = tmp_path("sync_mode_full");
        let _g = Cleanup(path.clone());
        let journal = JournalHandle::open_with_options(
            &path,
            JournalOptions::new().sync_mode(options::SyncMode::Full),
        )
        .expect("open");

        let target = journal.append(b"full-sync payload").expect("append");
        journal.sync_through(target).expect("sync_through full");

        let durable = journal.synced_lsn();
        assert!(durable >= target);
    }

    #[test]
    fn sync_mode_barrier_round_trips_through_journal() {
        // SyncMode::Barrier is routed through
        // platform::sync_barrier, whose primitive differs per
        // platform: fdatasync on Linux (same path as Full), a
        // no-op on Windows, F_BARRIERFSYNC on macOS. Each of
        // them returns Ok on a healthy filesystem, so whichever
        // one runs, sync_through must complete and the durable
        // frontier must cover the record.
        let path = tmp_path("sync_mode_barrier");
        let _g = Cleanup(path.clone());
        let journal = JournalHandle::open_with_options(
            &path,
            JournalOptions::new().sync_mode(options::SyncMode::Barrier),
        )
        .expect("open");

        let target = journal.append(b"barrier-sync payload").expect("append");
        journal.sync_through(target).expect("sync_through barrier");

        let durable = journal.synced_lsn();
        assert!(durable >= target);
    }

    #[test]
    fn write_lifetime_hint_open_succeeds_for_every_variant() {
        // The write-lifetime hint is advisory. Per the 0.9.4
        // notes the fcntl runs on Linux ≥ 4.13 and the option
        // is a no-op elsewhere; whether or not the kernel,
        // filesystem or drive honours it, opening with any
        // variant must succeed and the journal must behave
        // normally afterwards.
        let hints = [
            options::WriteLifetimeHint::Short,
            options::WriteLifetimeHint::Medium,
            options::WriteLifetimeHint::Long,
            options::WriteLifetimeHint::Extreme,
        ];
        for hint in hints {
            // One journal file per variant, named after it.
            let path = tmp_path(&format!("rw_hint_{hint:?}"));
            let _g = Cleanup(path.clone());
            let journal = JournalHandle::open_with_options(
                &path,
                JournalOptions::new().write_lifetime_hint(Some(hint)),
            )
            .expect("open with write_lifetime_hint must succeed");

            let target = journal.append(b"hint payload").expect("append");
            journal.sync_through(target).expect("sync_through");
            assert!(journal.synced_lsn() >= target);
        }
    }

    #[test]
    fn sync_mode_and_lifetime_hint_compose() {
        // Both 0.9.4 options set on the same JournalOptions:
        // Barrier sync mode together with a Long lifetime hint.
        // They must not interfere with each other or depend on
        // the order in which they are applied.
        let path = tmp_path("sync_and_hint");
        let _g = Cleanup(path.clone());
        let journal = JournalHandle::open_with_options(
            &path,
            JournalOptions::new()
                .sync_mode(options::SyncMode::Barrier)
                .write_lifetime_hint(Some(options::WriteLifetimeHint::Long)),
        )
        .expect("open");

        let target = journal.append(b"compose payload").expect("append");
        journal.sync_through(target).expect("sync_through");

        let durable = journal.synced_lsn();
        assert!(durable >= target);
    }

    #[test]
    fn group_commit_concurrent_sync_through() {
        // Append 32 records from one thread, then sync through
        // each record's LSN from its own thread. The calls are
        // intended to coalesce into few fsyncs, but the fsync
        // count is not observable here; the test only pins that
        // every call succeeds and the durable frontier ends up
        // covering the last record.
        use std::sync::Arc;

        let path = tmp_path("group_commit");
        let _g = Cleanup(path.clone());
        let journal = Arc::new(JournalHandle::open(&path).expect("open"));

        let lsns: Vec<_> = (0..32)
            .map(|i| {
                journal
                    .append(format!("rec {i:04}").as_bytes())
                    .expect("append")
            })
            .collect();

        // One thread per LSN, all contending on sync_through.
        let syncers: Vec<_> = lsns
            .iter()
            .copied()
            .map(|lsn| {
                let journal = Arc::clone(&journal);
                std::thread::spawn(move || journal.sync_through(lsn).expect("sync_through"))
            })
            .collect();
        for syncer in syncers {
            syncer.join().expect("join");
        }

        assert!(journal.synced_lsn() >= *lsns.last().unwrap());
    }
}