fsys 1.1.0

Filesystem IO for Rust storage engines: journal substrate, io_uring, NVMe passthrough, atomic writes, cross-platform durability.
Documentation
//! 0.9.4 — Linux io_uring kernel-feature probe.
//!
//! Probes which of the elite-tier setup flags
//! (`IORING_SETUP_COOP_TASKRUN`, `IORING_SETUP_SINGLE_ISSUER`,
//! `IORING_SETUP_DEFER_TASKRUN`) the running kernel accepts, then
//! caches the result for the lifetime of the process. Ring
//! constructors elsewhere in the crate consult [`features()`] to
//! decide which flags to set on `io_uring::IoUring::builder()`
//! before calling `.build(queue_depth)`.
//!
//! ## Why probe once, cache forever
//!
//! Each `io_uring_setup(2)` call is a syscall — cheap, but
//! probing every time a ring is constructed wastes work. The
//! kernel cannot change feature support over the process
//! lifetime (a hot kernel upgrade would require a restart), so a
//! single probe at first ring construction is sufficient.
//!
//! ## Probe strategy
//!
//! We try a single ring construction with the most aggressive
//! flag set first. On `EINVAL` we strip the highest-version flag
//! and retry. The walk is:
//!
//! 1. `DEFER_TASKRUN | SINGLE_ISSUER | COOP_TASKRUN` (≥ 6.1)
//! 2. `SINGLE_ISSUER | COOP_TASKRUN`                 (≥ 6.0)
//! 3. `COOP_TASKRUN`                                 (≥ 5.19)
//! 4. (no elite flags)                               (≤ 5.18)
//!
//! `DEFER_TASKRUN` is documented to **require** `SINGLE_ISSUER`,
//! so the two are tested together — there's no useful intermediate.
//!
//! ## What this is not
//!
//! - **Not** a probe for `IORING_SETUP_SQPOLL` or
//!   `IORING_SETUP_IOPOLL`. Both require dedicated cores /
//!   privilege configurations that vary too much per deployment
//!   to enable by default; future patches may add opt-in
//!   `Builder` knobs.
//! - **Not** a probe for `IORING_REGISTER_FILES` /
//!   `IORING_REGISTER_BUFFERS`. Those are register-time, not
//!   setup-time; the ring construction succeeds regardless and
//!   the registration call decides feature support at use time.
//!
//! ## Test surface
//!
//! `cargo test --lib platform::iouring_features` validates the
//! probe runs without panicking and produces a coherent
//! [`IoUringFeatures`] value (every probed flag is independently
//! `bool`-typed; no impossible combinations are produced because
//! `defer_taskrun ⟹ single_issuer` is enforced by the probe).

#![cfg(target_os = "linux")]

use std::sync::OnceLock;

/// Cached snapshot of which io_uring kernel features the host
/// supports. Populated on first call to [`features`]; immutable
/// thereafter.
///
/// The fields record what the **kernel accepts**, not what every
/// ring gets: [`apply`] additionally filters the flags by
/// [`RingMode`] before touching a builder.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub(crate) struct IoUringFeatures {
    /// `IORING_SETUP_COOP_TASKRUN` — cooperative task work
    /// (kernel ≥ 5.19). Reduces inter-processor interrupts on
    /// completion delivery. Safe to enable on any ring; no
    /// runtime conditions.
    pub coop_taskrun: bool,
    /// `IORING_SETUP_SINGLE_ISSUER` — single-issuer hint
    /// (kernel ≥ 6.0). The kernel enforces that all submissions
    /// come from one task; violations fail with `EEXIST`.
    /// [`apply`] sets this flag for [`RingMode::Sync`] rings
    /// only: the sync ring has a dedicated owner thread, whereas
    /// the async substrate's owner task may migrate between OS
    /// threads (see [`RingMode`]).
    pub single_issuer: bool,
    /// `IORING_SETUP_DEFER_TASKRUN` — defer task work to
    /// submit-time (kernel ≥ 6.1). The application is
    /// responsible for periodically calling
    /// `io_uring_enter(2)` so completions get processed. The
    /// sync path satisfies this through `submit_and_wait`; the
    /// eventfd-driven async completion loop does **not**, so
    /// [`apply`] sets this flag for [`RingMode::Sync`] rings
    /// only. Requires `single_issuer` (enforced by the kernel;
    /// the probe never reports this flag without
    /// `single_issuer`).
    pub defer_taskrun: bool,
}

/// Returns the io_uring kernel-feature snapshot, probing on first
/// call and caching the result once a probe has been conclusive.
///
/// The probe issues at most three `io_uring_setup(2)` calls: it
/// starts with the most aggressive flag set the kernel might
/// accept, strips flags on `EINVAL`, and retries. Each probe ring
/// is opened and immediately closed; no resources are held across
/// calls.
///
/// Returns [`IoUringFeatures::default`] (all-false) on hosts
/// where every probed flag is rejected — the fallback behaviour
/// is identical to pre-0.9.4 (vanilla `IoUring::new`).
///
/// If a probe ring fails to build for any reason **other than**
/// `EINVAL` (fd exhaustion, memory limits, a sandbox denying the
/// syscall, …) the failure says nothing about flag support. The
/// all-false baseline is returned for that call but is *not*
/// cached, so a transient failure cannot pin the whole process to
/// the baseline; the next call probes again.
pub(crate) fn features() -> IoUringFeatures {
    static CACHE: OnceLock<IoUringFeatures> = OnceLock::new();
    if let Some(cached) = CACHE.get() {
        return *cached;
    }
    match probe() {
        // `get_or_init` rather than `set`: if another thread
        // published a snapshot while we were probing, every caller
        // still observes that one cached value.
        Ok(found) => *CACHE.get_or_init(|| found),
        // Inconclusive probe. The error is dropped on purpose:
        // the caller is about to build its own ring and is
        // expected to hit and report the same failure there.
        Err(_) => IoUringFeatures::default(),
    }
}

/// Synchronous probe. Tries the most aggressive flag combination
/// first and steps down one tier each time the kernel answers
/// `EINVAL`. Costs at most three `io_uring_setup` syscalls.
///
/// # Errors
///
/// Returns the underlying error when a probe ring fails for a
/// reason other than `EINVAL`. Such a failure is not evidence of
/// missing flag support, so no feature set is reported.
fn probe() -> std::io::Result<IoUringFeatures> {
    // 0.9.7 H-9 — test-hook env-var bypass.
    //
    // Audit H-9: the elite-flag fallback paths (kernel < 5.19 →
    // no COOP_TASKRUN; kernel < 6.0 → no SINGLE_ISSUER; etc.)
    // weren't explicitly tested in CI — only the happy path on
    // whatever kernel the runner happened to provide. Adding
    // mockable feature gates is the fix.
    //
    // `FSYS_TEST_FORCE_NO_IOURING_FEATURES=1` forces this probe
    // to return [`IoUringFeatures::default`] (all-false) without
    // touching the kernel, exercising the pre-0.9.4 baseline
    // path in tests on any kernel. Only the variable's presence
    // is checked; its value is ignored.
    //
    // The env-var name is intentionally obscure to make
    // accidental triggering in production environments
    // vanishingly unlikely. A forced result counts as conclusive,
    // so it is cached like any other probe outcome.
    if std::env::var_os("FSYS_TEST_FORCE_NO_IOURING_FEATURES").is_some() {
        return Ok(IoUringFeatures::default());
    }

    // Tier 1 — DEFER_TASKRUN (6.1+) requires SINGLE_ISSUER, and
    // pairs naturally with COOP_TASKRUN. `let _ = ` consumes the
    // chained `&mut Builder` return so the crate's `unused_results`
    // lint is satisfied; the builder mutation is the side effect
    // we want.
    if try_build(|b| {
        let _ = b
            .setup_defer_taskrun()
            .setup_single_issuer()
            .setup_coop_taskrun();
    })? {
        return Ok(IoUringFeatures {
            coop_taskrun: true,
            single_issuer: true,
            defer_taskrun: true,
        });
    }

    // Tier 2 — SINGLE_ISSUER (6.0+) + COOP_TASKRUN.
    if try_build(|b| {
        let _ = b.setup_single_issuer().setup_coop_taskrun();
    })? {
        return Ok(IoUringFeatures {
            coop_taskrun: true,
            single_issuer: true,
            defer_taskrun: false,
        });
    }

    // Tier 3 — COOP_TASKRUN (5.19+) alone.
    if try_build(|b| {
        let _ = b.setup_coop_taskrun();
    })? {
        return Ok(IoUringFeatures {
            coop_taskrun: true,
            single_issuer: false,
            defer_taskrun: false,
        });
    }

    // Tier 4 — every elite flag was rejected with `EINVAL`. This
    // is the pre-0.9.4 baseline.
    Ok(IoUringFeatures::default())
}

/// Tries building a tiny (queue-depth 4) ring with the flags
/// applied by `cfg`. The ring is dropped immediately.
///
/// Returns `Ok(true)` if construction succeeded and `Ok(false)`
/// if the kernel rejected the setup with `EINVAL` (surfaced by
/// `std` as [`std::io::ErrorKind::InvalidInput`]), which is how
/// an unsupported flag combination is reported.
///
/// # Errors
///
/// Any other build failure is returned unchanged so the caller
/// can tell "flags unsupported" apart from "could not probe".
fn try_build<F>(cfg: F) -> std::io::Result<bool>
where
    F: FnOnce(&mut io_uring::Builder),
{
    let mut builder = io_uring::IoUring::builder();
    cfg(&mut builder);
    match builder.build(4) {
        Ok(_ring) => Ok(true),
        Err(err) if err.kind() == std::io::ErrorKind::InvalidInput => Ok(false),
        Err(err) => Err(err),
    }
}

/// Ring usage mode — selects which elite flags are safe to apply.
/// Consumed by [`apply`], which combines it with the probed
/// [`IoUringFeatures`].
///
/// The 0.9.4 elite-flag set (`COOP_TASKRUN` / `SINGLE_ISSUER` /
/// `DEFER_TASKRUN`) was originally applied unconditionally, but
/// **two** of them turned out to be incompatible with the async
/// substrate's design:
///
/// 1. **`DEFER_TASKRUN`** (kernel ≥ 6.1) requires the application
///    to drive completion processing via explicit
///    `io_uring_enter(IORING_ENTER_GETEVENTS)` calls — the kernel
///    will NOT process completions in background task work. The
///    async substrate submits via `ring.submit()` (no GETEVENTS)
///    and sleeps on `AsyncFd::readable()` waiting for eventfd
///    signalling. Under `DEFER_TASKRUN`, the kernel never posts
///    CQEs, eventfd never fires, the async loop hangs forever.
///
/// 2. **`SINGLE_ISSUER`** (kernel ≥ 6.0) is kernel-enforced —
///    only the task that owns the ring may submit. The kernel's
///    notion of "task" is an OS-level thread (TID). Under tokio's
///    `current_thread` runtime this is safe — the owner-task
///    always runs on the same OS thread. Under tokio's
///    `multi_thread` runtime, work-stealing migrates tasks
///    between worker threads at every yield point. After
///    migration, the owner task submits from a different TID
///    than the one that called `io_uring_setup`, the kernel
///    returns `-EEXIST`, the SQE is never processed, the CQE is
///    never posted, and the async loop hangs.
///
/// The sync owner-thread ring (`linux_iouring.rs`) is unaffected
/// by either: it uses `submit_and_wait(n)` (= `io_uring_enter(GETEVENTS=n)`)
/// which drives completions explicitly, and runs on a dedicated
/// `std::thread::spawn`-ed OS thread that never migrates.
///
/// Both bugs existed since 0.9.4 but were undetected until the
/// 0.9.6 feature-matrix CI exercised the async tests on a kernel
/// with the flags supported (≥ 6.1).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
// Why `dead_code` is allowed: variant usage is feature-gated.
// `Sync` is used by linux_iouring.rs (cfg target_os=linux);
// `Async` is used by completion_driver.rs (cfg target_os=linux +
// feature=async). The default-features Linux build constructs
// `Sync` only, so the lint would fire on `Async` without this.
#[allow(dead_code)]
pub(crate) enum RingMode {
    /// Sync owner-thread ring (`linux_iouring.rs`). Uses
    /// `submit_and_wait(n)` (compatible with `DEFER_TASKRUN`) and
    /// runs on a dedicated `std::thread::spawn`-ed OS thread that
    /// never migrates (compatible with `SINGLE_ISSUER`). All elite
    /// flags supported by the kernel are applied.
    Sync,
    /// Async substrate (`completion_driver.rs`). Uses eventfd
    /// signalling (INCOMPATIBLE with `DEFER_TASKRUN`) and is a
    /// tokio task that work-stealing may migrate between worker
    /// threads under the `multi_thread` runtime (INCOMPATIBLE
    /// with `SINGLE_ISSUER`). Only `COOP_TASKRUN` is applied —
    /// it's a pure performance hint with no enforcement
    /// requirements.
    Async,
}

/// Configures an `io_uring::Builder` with every elite setup flag
/// that is both supported by the host kernel (per [`features`])
/// and safe for the requested [`RingMode`].
///
/// Typical use from a ring constructor:
///
/// ```text
/// let mut b = io_uring::IoUring::builder();
/// iouring_features::apply(&mut b, RingMode::Sync);
/// let ring = b.build(queue_depth)?;
/// ```
///
/// The builder is only mutated, never consumed, so the caller may
/// keep chaining setup methods afterwards. Calling `apply` again
/// on the same builder re-sets the same flag bits and therefore
/// changes nothing.
pub(crate) fn apply(builder: &mut io_uring::Builder, mode: RingMode) {
    let supported = features();

    // Only the sync ring lives on a dedicated OS thread and drives
    // completions itself; both task-identity flags below depend on
    // that (see the `RingMode` docs for the async failure modes).
    let owner_thread_ring = match mode {
        RingMode::Sync => true,
        RingMode::Async => false,
    };

    // COOP_TASKRUN (5.19+): pure performance hint that lets the
    // kernel run completion task work at the next convenient
    // kernel transition instead of sending an IPI. It places no
    // constraint on who submits or how completions are reaped, so
    // both ring modes get it.
    if supported.coop_taskrun {
        let _ = builder.setup_coop_taskrun();
    }

    if owner_thread_ring && supported.single_issuer {
        // SINGLE_ISSUER (6.0+): the kernel rejects submissions
        // from any task other than the ring's owner.
        let _ = builder.setup_single_issuer();

        // DEFER_TASKRUN (6.1+): only valid together with
        // SINGLE_ISSUER, and the application must reap
        // completions via `io_uring_enter(GETEVENTS)`. Nesting it
        // here keeps that dependency structural.
        if supported.defer_taskrun {
            let _ = builder.setup_defer_taskrun();
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Whatever the probe reports, it must honour the kernel rule
    /// `defer_taskrun ⟹ single_issuer`; a snapshot violating it
    /// would describe a ring the kernel refuses to create.
    #[test]
    fn defer_taskrun_implies_single_issuer() {
        let snapshot = features();
        // `a ⟹ b` written as `!a || b`.
        assert!(
            !snapshot.defer_taskrun || snapshot.single_issuer,
            "DEFER_TASKRUN reported without SINGLE_ISSUER — \
                 the kernel will reject a ring built this way"
        );
    }

    /// Repeated calls to `features()` must keep returning the
    /// value seen on the first call. The cache itself is private,
    /// so stability of the returned value is what we check.
    #[test]
    fn features_are_cached_and_stable() {
        let first = features();
        for again in std::iter::repeat_with(features).take(16) {
            assert_eq!(again, first);
        }
    }

    /// After `apply(.., RingMode::Sync)` the builder must still
    /// produce a ring on this host, whatever feature set was
    /// probed.
    #[test]
    fn apply_does_not_panic_and_builds_sync() {
        let outcome = {
            let mut builder = io_uring::IoUring::builder();
            apply(&mut builder, RingMode::Sync);
            builder.build(4)
        };
        assert!(
            outcome.is_ok(),
            "apply(Sync) produced an unbuildable ring on this host: {:?}",
            outcome.err()
        );
    }

    /// Same guarantee as the sync variant, for
    /// `RingMode::Async`.
    #[test]
    fn apply_does_not_panic_and_builds_async() {
        let outcome = {
            let mut builder = io_uring::IoUring::builder();
            apply(&mut builder, RingMode::Async);
            builder.build(4)
        };
        assert!(
            outcome.is_ok(),
            "apply(Async) produced an unbuildable ring on this host: {:?}",
            outcome.err()
        );
    }
}