Skip to main content

vyre_runtime/uring/
gpudirect.rs

1//! GPUDirect Storage capability probe + passthrough helper.
2//!
//! NVMe → VRAM direct DMA on Linux with nvidia-fs installed + kernel
4//! 6.0+. Callers probe the capability at startup via
5//! [`GpuDirectCapability::probe`]; if present, they use
6//! [`encode_nvme_read_sqe`] to pack a 64-byte NVMe Read command that
7//! `AsyncUringStream::submit_nvme_passthrough` feeds into
8//! `IORING_OP_URING_CMD`. Bytes land in a
9//! `GpuMappedBuffer::from_bar1_peer`-backed region with zero host
10//! bounce.
11//!
//! Gated behind the `uring-cmd-nvme` feature — the module is
13//! compiled even without the feature so consumers can read
14//! [`GpuDirectCapability::probe`] and get a structured
15//! `Disabled` result instead of a link error.
16
17#[cfg(all(target_os = "linux", feature = "uring-cmd-nvme"))]
18use std::fs;
19#[cfg(all(target_os = "linux", feature = "uring-cmd-nvme"))]
20use std::io::{ErrorKind, Read as _};
21
/// Largest nvidia-fs stats payload, in bytes, that the probe accepts
/// (1 MiB). `read_nvidia_fs_stats` reads at most one byte beyond this
/// cap and reports an error when the file turns out to be larger.
#[cfg(all(target_os = "linux", feature = "uring-cmd-nvme"))]
const MAX_NVIDIA_FS_STATS_BYTES: u64 = 1 << 20;
24
/// Outcome of asking the host whether GPUDirect Storage can be used.
///
/// Produced by [`GpuDirectCapability::probe`]; use
/// [`GpuDirectCapability::is_available`] for a plain yes/no answer.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum GpuDirectCapability {
    /// nvidia-fs is present and its stats file returned non-blank
    /// content, so the direct NVMe-to-VRAM path may be used.
    Available {
        /// Verbatim text of `/proc/driver/nvidia-fs/stats` captured
        /// when the probe ran. Kept unparsed for diagnostics; callers
        /// may interpret it however they need.
        stats: String,
    },
    /// The probe ran, but the stats file was missing, unreadable, or
    /// blank. Callers fall back to `IORING_OP_READ_FIXED` into
    /// host-visible GPU memory — still zero-copy past the PCIe root
    /// complex, but not bypassed.
    Unavailable {
        /// Fixed, human-readable explanation of what the probe found.
        reason: &'static str,
    },
    /// No probe was attempted because the build is not Linux with the
    /// `uring-cmd-nvme` feature enabled. Reported as its own variant
    /// so a caller that expected the fast path can spot the
    /// configuration mismatch.
    FeatureDisabled,
}
49
50impl GpuDirectCapability {
51    /// Probe the host.
52    ///
53    /// Reads `/proc/driver/nvidia-fs/stats`. Presence of the file +
54    /// non-empty contents = `Available`. A file-not-found or
55    /// permission error = `Unavailable` with a structured reason.
56    /// Non-Linux / feature-disabled hosts return `FeatureDisabled`.
57    #[must_use]
58    pub fn probe() -> Self {
59        #[cfg(not(all(target_os = "linux", feature = "uring-cmd-nvme")))]
60        {
61            GpuDirectCapability::FeatureDisabled
62        }
63
64        #[cfg(all(target_os = "linux", feature = "uring-cmd-nvme"))]
65        match read_nvidia_fs_stats() {
66            Ok(stats) if !stats.trim().is_empty() => {
67                GpuDirectCapability::Available { stats }
68            }
69            Ok(_) => GpuDirectCapability::Unavailable {
70                reason: "nvidia-fs stats file is empty; driver reports no GPUDirect sessions",
71            },
72            Err(err) if err.kind() == ErrorKind::NotFound => GpuDirectCapability::Unavailable {
73                reason: "/proc/driver/nvidia-fs/stats not found; nvidia-fs is not installed",
74            },
75            Err(err) if err.kind() == ErrorKind::PermissionDenied => GpuDirectCapability::Unavailable {
76                reason: "/proc/driver/nvidia-fs/stats refused permission; run with adequate privileges",
77            },
78            Err(_) => GpuDirectCapability::Unavailable {
79                reason: "/proc/driver/nvidia-fs/stats read failed for an unexpected reason",
80            },
81        }
82    }
83
84    /// True when the fast path is available and callers should
85    /// construct a `GpuMappedBuffer::from_bar1_peer`-backed region.
86    #[must_use]
87    pub fn is_available(&self) -> bool {
88        matches!(self, GpuDirectCapability::Available { .. })
89    }
90}
91
/// Read `/proc/driver/nvidia-fs/stats` into a `String`, refusing to
/// accept more than `MAX_NVIDIA_FS_STATS_BYTES` bytes.
///
/// # Errors
///
/// Propagates any error from opening or reading the file (including
/// the error `read_to_string` reports for non-UTF-8 content), and
/// returns `InvalidData` when the file is larger than the cap.
#[cfg(all(target_os = "linux", feature = "uring-cmd-nvme"))]
fn read_nvidia_fs_stats() -> std::io::Result<String> {
    let source = fs::File::open("/proc/driver/nvidia-fs/stats")?;

    // Allow one byte past the cap so an oversized file can be told
    // apart from one that is exactly at the limit.
    let mut limited = source.take(MAX_NVIDIA_FS_STATS_BYTES + 1);
    let mut contents = String::new();
    limited.read_to_string(&mut contents)?;

    let byte_count = match u64::try_from(contents.len()) {
        Ok(count) => count,
        Err(error) => {
            return Err(std::io::Error::new(
                std::io::ErrorKind::InvalidData,
                format!("nvidia-fs stats length cannot fit u64: {error}"),
            ));
        }
    };

    if byte_count <= MAX_NVIDIA_FS_STATS_BYTES {
        Ok(contents)
    } else {
        Err(std::io::Error::new(
            std::io::ErrorKind::InvalidData,
            "nvidia-fs stats exceeded bounded read limit",
        ))
    }
}
113
/// Opcode byte (`0x02`) identifying an NVMe Read command; written to
/// byte 0 of the payload built by [`encode_nvme_read_sqe`].
///
/// The previous comment cited NVMe Base Spec 1.4, §5.15 for this
/// value. NOTE(review): that section number has not been verified.
pub const NVME_CMD_READ: u8 = 0x02;
116
117/// Encode a 64-byte NVMe Read command payload suitable for
118/// `AsyncUringStream::submit_nvme_passthrough`. Callers supply
119/// the target LBA range + the destination BAR1 peer pointer; the
120/// kernel DMAs the blocks directly into VRAM.
121///
122/// # NVMe passthrough layout
123///
124/// ```text
125/// byte  0..4 : cmd_op (NVME_CMD_READ)
126/// byte  4..8 : nsid    (namespace id, commonly 1)
127/// byte  8..16: reserved
128/// byte 16..24: reserved
129/// byte 24..32: reserved (metadata ptr)
130/// byte 32..40: dest_ptr (BAR1 peer  -  VRAM)
131/// byte 40..48: starting LBA (little-endian u64)
132/// byte 48..52: number_of_blocks (zero-based, so `blocks - 1`)
133/// byte 52..56: dsmgmt
134/// byte 56..60: reserved
135/// byte 60..64: reserved
136/// ```
137///
138/// The helper zeroes reserved regions defensively so forging is
139/// harder. The caller retains responsibility for validating lba +
140/// blocks against the namespace's capacity.
141#[must_use]
142pub fn encode_nvme_read_sqe(
143    namespace_id: u32,
144    starting_lba: u64,
145    blocks: u32,
146    dest_bar1_ptr: u64,
147) -> [u8; 64] {
148    assert!(
149        blocks > 0,
150        "NVMe read SQE cannot encode zero blocks; validate read length before submitting GPU-direct ingest"
151    );
152    let mut buf = [0u8; 64];
153    buf[0] = NVME_CMD_READ;
154    buf[4..8].copy_from_slice(&namespace_id.to_le_bytes());
155    buf[32..40].copy_from_slice(&dest_bar1_ptr.to_le_bytes());
156    buf[40..48].copy_from_slice(&starting_lba.to_le_bytes());
157    // NVMe encodes "number of logical blocks" as zero-based: 0 = 1 block.
158    let zero_based = blocks - 1;
159    buf[48..52].copy_from_slice(&zero_based.to_le_bytes());
160    buf
161}
162
#[cfg(test)]
mod tests {
    use super::*;

    /// The variant returned depends on the host, so this only checks
    /// that probing completes without panicking and yields one of the
    /// declared variants.
    #[test]
    fn probe_returns_a_structured_variant() {
        let capability = GpuDirectCapability::probe();
        match capability {
            GpuDirectCapability::Available { .. }
            | GpuDirectCapability::Unavailable { .. }
            | GpuDirectCapability::FeatureDisabled => {}
        }
    }

    /// Pins the byte offsets of every populated field and checks that
    /// the remaining bytes are zero.
    #[test]
    fn encode_nvme_read_sqe_layout_matches_spec() {
        const NSID: u32 = 1;
        const START_LBA: u64 = 0x1122_3344_5566_7788;
        const BLOCKS: u32 = 8;
        const DEST: u64 = 0xAABB_CCDD_EEFF_0011;

        let sqe = encode_nvme_read_sqe(NSID, START_LBA, BLOCKS, DEST);

        assert_eq!(sqe[0], NVME_CMD_READ);
        assert_eq!(sqe[4..8], NSID.to_le_bytes());
        assert_eq!(sqe[32..40], DEST.to_le_bytes());
        assert_eq!(sqe[40..48], START_LBA.to_le_bytes());
        // 8 blocks → 7 zero-based.
        assert_eq!(sqe[48..52], (BLOCKS - 1).to_le_bytes());
        // Reserved regions stay zero.
        assert_eq!(&sqe[8..32], &[0u8; 24]);
        assert_eq!(&sqe[52..64], &[0u8; 12]);
    }

    /// NVMe's zero-based encoding: 1 block → 0.
    #[test]
    fn encode_nvme_single_block_yields_zero_in_nblocks_field() {
        let single_block = encode_nvme_read_sqe(1, 0, 1, 0);
        assert_eq!(single_block[48..52], 0u32.to_le_bytes());
    }

    /// `is_available` is true for `Available` and false otherwise.
    #[test]
    fn is_available_reflects_variant() {
        let present = GpuDirectCapability::Available {
            stats: "session_count=1".into(),
        };
        let absent = GpuDirectCapability::Unavailable {
            reason: "test reason",
        };
        let compiled_out = GpuDirectCapability::FeatureDisabled;

        assert!(present.is_available());
        assert!(!absent.is_available());
        assert!(!compiled_out.is_available());
    }
}