#![expect(
clippy::cast_sign_loss,
clippy::cast_possible_truncation,
clippy::cast_possible_wrap,
clippy::cast_ptr_alignment,
reason = "Linux syscall/mmap ABI: register-width arg casts + page-aligned ring field reads"
)]
use alloc::boxed::Box;
use alloc::ffi::CString;
#[cfg(not(feature = "std"))]
use alloc::format;
use alloc::string::{String, ToString};
use alloc::sync::Arc;
use alloc::vec::Vec;
use core::sync::atomic::{Ordering, fence};
use spin::Mutex;
use syscalls::{Errno, Sysno, syscall1, syscall2, syscall3, syscall4, syscall5, syscall6};
use crate::fs::{Fs, FsDirEntry, FsFile, FsMetadata, FsOpenOptions};
use crate::io::{Error, ErrorKind, SeekFrom};
use crate::path::Path;
// mmap offsets that select which io_uring region `IoUringRaw::new` maps
// (submission ring, completion ring, SQE array).
const IORING_OFF_SQ_RING: u64 = 0;
const IORING_OFF_CQ_RING: u64 = 0x0800_0000;
const IORING_OFF_SQES: u64 = 0x1000_0000;
// `io_uring_enter` flag passed by `submit_and_reap_one` together with `min_complete = 1`.
const IORING_ENTER_GETEVENTS: u32 = 1;
// Bit tested against `IoUringParams::features`; when set, `IoUringRaw::new` reaches
// both rings through a single mapping.
const IORING_FEAT_SINGLE_MMAP: u32 = 1;
// SQE opcodes issued by `nop`, `fsync`, `read_at` and `write_at` respectively.
const IORING_OP_NOP: u8 = 0;
const IORING_OP_FSYNC: u8 = 3;
const IORING_OP_READ: u8 = 22;
const IORING_OP_WRITE: u8 = 23;
// `op_flags` bit set by `IoUringRaw::fsync` when `datasync` is requested.
const IORING_FSYNC_DATASYNC: u32 = 1;
// Directory-fd argument passed to the path-based syscalls in this file
// (`openat`, `mkdirat`, `unlinkat`, `renameat2`, and `statx` by path).
const AT_FDCWD: i32 = -100;
// Access-mode values for the `flags` argument of `open_raw`.
pub const O_RDONLY: i32 = 0;
pub const O_WRONLY: i32 = 1;
pub const O_RDWR: i32 = 2;
// `O_CREAT` is selected per target architecture: MIPS and SPARC targets get their own
// values, every other target uses 0o100.
#[cfg(not(any(
target_arch = "mips",
target_arch = "mips32r6",
target_arch = "mips64",
target_arch = "mips64r6",
target_arch = "sparc",
target_arch = "sparc64"
)))]
pub const O_CREAT: i32 = 0o100;
#[cfg(any(
target_arch = "mips",
target_arch = "mips32r6",
target_arch = "mips64",
target_arch = "mips64r6"
))]
pub const O_CREAT: i32 = 0x100;
#[cfg(any(target_arch = "sparc", target_arch = "sparc64"))]
pub const O_CREAT: i32 = 0x200;
// `O_TRUNC` only differs on SPARC targets.
#[cfg(not(any(target_arch = "sparc", target_arch = "sparc64")))]
pub const O_TRUNC: i32 = 0o1000;
#[cfg(any(target_arch = "sparc", target_arch = "sparc64"))]
pub const O_TRUNC: i32 = 0x400;
// Protection and flag bits for the `mmap` calls in `IoUringRaw::new`; every ring
// mapping there uses `PROT_READ | PROT_WRITE` and `MAP_SHARED | MAP_POPULATE`.
const PROT_READ: usize = 0x1;
const PROT_WRITE: usize = 0x2;
const MAP_SHARED: usize = 0x01;
const MAP_POPULATE: usize = 0x0_8000;
/// Byte offsets of the submission-ring fields inside the mapped SQ ring.
///
/// Embedded in [`IoUringParams`], which is handed by pointer to `io_uring_setup`.
/// `IoUringRaw::new` adds `head`, `tail`, `ring_mask` and `array` to the mapping base
/// to locate the corresponding ring fields; the other fields are not read in this file.
#[repr(C)]
#[derive(Default, Clone, Copy)]
struct SqRingOffsets {
head: u32,
tail: u32,
ring_mask: u32,
ring_entries: u32,
flags: u32,
dropped: u32,
array: u32,
resv1: u32,
resv2: u64,
}
/// Byte offsets of the completion-ring fields inside the mapped CQ ring.
///
/// Embedded in [`IoUringParams`]. `IoUringRaw::new` uses `head`, `tail`, `ring_mask`
/// and `cqes` (the start of the CQE array); the other fields are not read in this file.
#[repr(C)]
#[derive(Default, Clone, Copy)]
struct CqRingOffsets {
head: u32,
tail: u32,
ring_mask: u32,
ring_entries: u32,
overflow: u32,
cqes: u32,
flags: u32,
resv1: u32,
resv2: u64,
}
/// Parameter block passed by pointer to the `io_uring_setup` syscall.
///
/// `IoUringRaw::new` passes a zeroed (default) instance and afterwards reads
/// `sq_entries`, `cq_entries`, `features`, `sq_off` and `cq_off` from it.
#[repr(C)]
#[derive(Default, Clone, Copy)]
struct IoUringParams {
sq_entries: u32,
cq_entries: u32,
flags: u32,
sq_thread_cpu: u32,
sq_thread_idle: u32,
features: u32,
wq_fd: u32,
resv: [u32; 3],
sq_off: SqRingOffsets,
cq_off: CqRingOffsets,
}
/// One submission-queue entry.
///
/// Built by `nop`, `read_at`, `write_at` and `fsync`, then copied into the mapped SQE
/// array by `submit_and_reap_one`. Only `opcode`, `fd`, `off`, `addr`, `len`,
/// `op_flags` and `user_data` are ever set to non-default values in this file.
/// NOTE(review): the layout presumably mirrors the kernel's `struct io_uring_sqe` — confirm.
#[repr(C)]
#[derive(Default, Clone, Copy)]
struct IoUringSqe {
opcode: u8,
flags: u8,
ioprio: u16,
fd: i32,
off: u64,
addr: u64,
len: u32,
op_flags: u32,
user_data: u64,
buf_index: u16,
personality: u16,
splice_fd_in: i32,
pad2: [u64; 2],
}
/// One completion-queue entry as read from the mapped CQ ring.
///
/// `submit_and_reap_one` only consumes `res`: negative values are turned into an
/// error (negated and fed to `errno_to_kind`), non-negative values are returned.
#[repr(C)]
#[derive(Default, Clone, Copy)]
struct IoUringCqe {
user_data: u64,
res: i32,
flags: u32,
}
/// Translates a raw errno number into the crate's [`ErrorKind`].
///
/// Any value without an explicit arm collapses to [`ErrorKind::Other`].
fn errno_to_kind(errno: i32) -> ErrorKind {
    match errno {
        2 => ErrorKind::NotFound,
        4 => ErrorKind::Interrupted,
        11 => ErrorKind::WouldBlock,
        17 => ErrorKind::AlreadyExists,
        95 => ErrorKind::Unsupported,
        1 | 13 => ErrorKind::PermissionDenied,
        9 | 22 => ErrorKind::InvalidInput,
        _ => ErrorKind::Other,
    }
}
fn err(syscall: &str, e: Errno) -> Error {
let raw = e.into_raw();
Error::new(errno_to_kind(raw), format!("{syscall} failed: errno {raw}"))
}
/// Invokes the `io_uring_setup` syscall and returns the new ring's file descriptor.
///
/// # Errors
/// Returns the syscall's errno wrapped by [`err`].
///
/// # Safety
/// `params` is handed to the kernel as a raw address; the caller must pass a pointer
/// to a live, writable [`IoUringParams`].
unsafe fn io_uring_setup(entries: u32, params: *mut IoUringParams) -> Result<i32, Error> {
    // SAFETY: forwarded from this function's own contract.
    let outcome = unsafe { syscall2(Sysno::io_uring_setup, entries as usize, params as usize) };
    match outcome {
        Ok(ring_fd) => Ok(ring_fd as i32),
        Err(e) => Err(err("io_uring_setup", e)),
    }
}
/// Invokes the `io_uring_enter` syscall on ring `fd`.
///
/// The two trailing syscall arguments are always passed as `0`. On success the
/// syscall's return value is handed back as a `u32`.
///
/// # Errors
/// Returns the syscall's errno wrapped by [`err`].
///
/// # Safety
/// NOTE(review): presumably `fd` must be a ring whose queued SQEs reference memory
/// that stays valid until completion — confirm against callers.
unsafe fn io_uring_enter(
    fd: i32,
    to_submit: u32,
    min_complete: u32,
    flags: u32,
) -> Result<u32, Error> {
    // SAFETY: forwarded from this function's own contract.
    let outcome = unsafe {
        syscall6(
            Sysno::io_uring_enter,
            fd as usize,
            to_submit as usize,
            min_complete as usize,
            flags as usize,
            0,
            0,
        )
    };
    match outcome {
        Ok(count) => Ok(count as u32),
        Err(e) => Err(err("io_uring_enter", e)),
    }
}
/// Maps `len` bytes of `fd` at `offset`, letting the kernel pick the address
/// (the address hint is `0`).
///
/// # Errors
/// Returns the syscall's errno wrapped by [`err`].
///
/// # Safety
/// NOTE(review): the caller owns the returned mapping and presumably must release it
/// with `munmap` using the same length — confirm.
#[cfg_attr(coverage_nightly, coverage(off))]
unsafe fn mmap(
    len: usize,
    prot: usize,
    flags: usize,
    fd: i32,
    offset: u64,
) -> Result<*mut u8, Error> {
    let no_hint = 0usize;
    // SAFETY: forwarded from this function's own contract.
    let mapped = unsafe {
        syscall6(
            Sysno::mmap,
            no_hint,
            len,
            prot,
            flags,
            fd as usize,
            offset as usize,
        )
    };
    match mapped {
        Err(e) => Err(err("mmap", e)),
        Ok(addr) => Ok(addr as *mut u8),
    }
}
/// Unmaps `len` bytes starting at `addr`; any error from the syscall is ignored.
///
/// # Safety
/// NOTE(review): presumably `addr`/`len` must describe a mapping obtained from `mmap`
/// that nothing references any more — confirm against callers.
unsafe fn munmap(addr: *mut u8, len: usize) {
let _ = unsafe { syscall2(Sysno::munmap, addr as usize, len) };
}
/// Closes `fd`, ignoring any error from the syscall (used on drop/cleanup paths).
///
/// # Safety
/// NOTE(review): presumably `fd` must be owned by the caller and not used afterwards — confirm.
unsafe fn close(fd: i32) {
let _ = unsafe { syscall1(Sysno::close, fd as usize) };
}
/// Opens `path` with `openat` relative to `AT_FDCWD` and returns the raw file descriptor.
///
/// `flags` and `mode` are forwarded to the syscall unchanged.
///
/// # Errors
/// Returns the syscall's errno wrapped as an [`Error`].
pub fn open_raw(path: &core::ffi::CStr, flags: i32, mode: u32) -> Result<i32, Error> {
    let path_addr = path.as_ptr() as usize;
    // SAFETY: `path` is a live NUL-terminated string for the duration of the call.
    let opened = unsafe {
        syscall4(
            Sysno::openat,
            AT_FDCWD as usize,
            path_addr,
            flags as usize,
            mode as usize,
        )
    };
    match opened {
        Ok(fd) => Ok(fd as i32),
        Err(e) => Err(err("openat", e)),
    }
}
/// Closes `fd` and reports a failure of the `close` syscall to the caller.
///
/// # Errors
/// Returns the syscall's errno wrapped as an [`Error`].
pub fn close_raw(fd: i32) -> Result<(), Error> {
    // SAFETY: `close` only takes an integer descriptor; no memory is passed to the kernel.
    match unsafe { syscall1(Sysno::close, fd as usize) } {
        Ok(_) => Ok(()),
        Err(e) => Err(err("close", e)),
    }
}
// `whence` value passed to `lseek` by `file_size_raw`.
const SEEK_END: i32 = 2;
// `flock` operation bits combined in `flock_exclusive_raw`.
const LOCK_EX: i32 = 2;
const LOCK_NB: i32 = 4;
// `statx` flag used by `fstat_raw` together with an empty path, so the fd itself is queried.
const AT_EMPTY_PATH: i32 = 0x1000;
// `statx` request mask used by both statx wrappers in this file.
const STATX_BASIC_STATS: u32 = 0x0000_07ff;
// File-type mask and type values compared against `Statx::stx_mode`.
const S_IFMT: u16 = 0o170_000;
const S_IFDIR: u16 = 0o040_000;
const S_IFREG: u16 = 0o100_000;
/// Timestamp record embedded four times in [`Statx`].
///
/// None of its fields are read in this file; it exists to give `Statx` the right size and layout.
#[repr(C)]
#[derive(Default, Clone, Copy)]
struct StatxTimestamp {
tv_sec: i64,
tv_nsec: u32,
_reserved: i32,
}
/// Output buffer for the `statx` syscall.
///
/// `fstat_raw` and `statx_path_raw` pass a pointer to a default-initialised instance
/// and afterwards read only `stx_mode` (masked with `S_IFMT`) and `stx_size`.
#[repr(C)]
#[derive(Default, Clone, Copy)]
struct Statx {
stx_mask: u32,
stx_blksize: u32,
stx_attributes: u64,
stx_nlink: u32,
stx_uid: u32,
stx_gid: u32,
stx_mode: u16,
_spare0: u16,
stx_ino: u64,
stx_size: u64,
stx_blocks: u64,
stx_attributes_mask: u64,
stx_atime: StatxTimestamp,
stx_btime: StatxTimestamp,
stx_ctime: StatxTimestamp,
stx_mtime: StatxTimestamp,
stx_rdev_major: u32,
stx_rdev_minor: u32,
stx_dev_major: u32,
stx_dev_minor: u32,
stx_mnt_id: u64,
stx_dio_mem_align: u32,
stx_dio_offset_align: u32,
_spare3: [u64; 12],
}
/// The subset of `statx` results exposed by [`fstat_raw`] and [`statx_path_raw`].
///
/// `is_dir` and `is_file` are both `false` for any other file type
/// (the type bits match neither `S_IFDIR` nor `S_IFREG`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct RawMetadata {
    /// File size in bytes, taken from `stx_size`.
    pub size: u64,
    /// `true` when the file-type bits equal `S_IFDIR`.
    pub is_dir: bool,
    /// `true` when the file-type bits equal `S_IFREG`.
    pub is_file: bool,
}
/// Queries size and file type of the open descriptor `fd`.
///
/// Uses `statx` with an empty path and `AT_EMPTY_PATH`, so the descriptor itself is inspected.
///
/// # Errors
/// Returns the syscall's errno wrapped as an [`Error`].
pub fn fstat_raw(fd: i32) -> Result<RawMetadata, Error> {
    let no_path: &core::ffi::CStr = c"";
    let mut stx = Statx::default();
    // SAFETY: `no_path` is NUL-terminated and `stx` is a live, writable buffer for the call.
    let outcome = unsafe {
        syscall5(
            Sysno::statx,
            fd as usize,
            no_path.as_ptr() as usize,
            AT_EMPTY_PATH as usize,
            STATX_BASIC_STATS as usize,
            &raw mut stx as usize,
        )
    };
    if let Err(e) = outcome {
        return Err(err("statx", e));
    }
    let file_type = stx.stx_mode & S_IFMT;
    Ok(RawMetadata {
        size: stx.stx_size,
        is_dir: file_type == S_IFDIR,
        is_file: file_type == S_IFREG,
    })
}
/// Sets the length of the file behind `fd` to `length` bytes via `ftruncate`.
///
/// # Errors
/// Returns the syscall's errno wrapped as an [`Error`].
pub fn ftruncate_raw(fd: i32, length: u64) -> Result<(), Error> {
    // SAFETY: only integers are passed to the kernel.
    match unsafe { syscall2(Sysno::ftruncate, fd as usize, length as usize) } {
        Ok(_) => Ok(()),
        Err(e) => Err(err("ftruncate", e)),
    }
}
/// Calls `lseek` on `fd` and returns the resulting offset.
///
/// # Errors
/// Returns the syscall's errno wrapped as an [`Error`].
pub fn lseek_raw(fd: i32, offset: i64, whence: i32) -> Result<u64, Error> {
    // SAFETY: only integers are passed to the kernel.
    let outcome = unsafe { syscall3(Sysno::lseek, fd as usize, offset as usize, whence as usize) };
    match outcome {
        Ok(position) => Ok(position as u64),
        Err(e) => Err(err("lseek", e)),
    }
}
/// Takes an exclusive `flock` on `fd`.
///
/// Returns `Ok(true)` once the lock is held. With `non_blocking` set, returns
/// `Ok(false)` when the syscall fails with errno 11 instead of waiting.
/// A failure with errno 4 is retried.
///
/// # Errors
/// Any other errno is wrapped as an [`Error`].
pub fn flock_exclusive_raw(fd: i32, non_blocking: bool) -> Result<bool, Error> {
    const EINTR: i32 = 4;
    const EAGAIN: i32 = 11;
    let mut op = LOCK_EX;
    if non_blocking {
        op |= LOCK_NB;
    }
    loop {
        // SAFETY: only integers are passed to the kernel.
        let attempt = unsafe { syscall2(Sysno::flock, fd as usize, op as usize) };
        let e = match attempt {
            Ok(_) => return Ok(true),
            Err(e) => e,
        };
        match e.into_raw() {
            EINTR => {}
            EAGAIN if non_blocking => return Ok(false),
            _ => return Err(err("flock", e)),
        }
    }
}
/// Returns the size of the file behind `fd` by seeking to its end.
///
/// Implemented as `lseek(fd, 0, SEEK_END)`, so the value returned is the resulting offset.
/// NOTE(review): this presumably also leaves the descriptor's own file offset at
/// end-of-file; the ring reads/writes in this file pass explicit offsets — confirm
/// no caller relies on the fd offset.
pub fn file_size_raw(fd: i32) -> Result<u64, Error> {
lseek_raw(fd, 0, SEEK_END)
}
// `openat` flag bits. Like `O_CREAT`/`O_TRUNC`, several of these are not the same on
// every Linux architecture, so each one is selected per target.
//
// `O_APPEND`: 0x8 on MIPS and SPARC, 0o2000 elsewhere.
#[cfg(not(any(
    target_arch = "mips",
    target_arch = "mips32r6",
    target_arch = "mips64",
    target_arch = "mips64r6",
    target_arch = "sparc",
    target_arch = "sparc64"
)))]
const O_APPEND: i32 = 0o2000;
#[cfg(any(
    target_arch = "mips",
    target_arch = "mips32r6",
    target_arch = "mips64",
    target_arch = "mips64r6",
    target_arch = "sparc",
    target_arch = "sparc64"
))]
const O_APPEND: i32 = 0x8;
// `O_EXCL`: 0x400 on MIPS, 0x800 on SPARC, 0o200 elsewhere.
#[cfg(not(any(
    target_arch = "mips",
    target_arch = "mips32r6",
    target_arch = "mips64",
    target_arch = "mips64r6",
    target_arch = "sparc",
    target_arch = "sparc64"
)))]
const O_EXCL: i32 = 0o200;
#[cfg(any(
    target_arch = "mips",
    target_arch = "mips32r6",
    target_arch = "mips64",
    target_arch = "mips64r6"
))]
const O_EXCL: i32 = 0x400;
#[cfg(any(target_arch = "sparc", target_arch = "sparc64"))]
const O_EXCL: i32 = 0x800;
// `O_DIRECTORY`: ARM, AArch64, PowerPC and m68k use 0o40000. On those targets 0o200000
// is `O_DIRECT`, so the generic value must not be used there.
#[cfg(not(any(
    target_arch = "arm",
    target_arch = "aarch64",
    target_arch = "powerpc",
    target_arch = "powerpc64",
    target_arch = "m68k"
)))]
const O_DIRECTORY: i32 = 0o200_000;
#[cfg(any(
    target_arch = "arm",
    target_arch = "aarch64",
    target_arch = "powerpc",
    target_arch = "powerpc64",
    target_arch = "m68k"
))]
const O_DIRECTORY: i32 = 0o40_000;
// `unlinkat` flag used by `unlinkat_raw` when a directory is to be removed.
const AT_REMOVEDIR: i32 = 0x200;
// `d_type` value that `read_dir_entries` treats as "directory".
const DT_DIR: u8 = 4;
/// Creates the directory `path` (resolved relative to `AT_FDCWD`) with permission bits `mode`.
///
/// # Errors
/// Returns the syscall's errno wrapped as an [`Error`].
pub fn mkdirat_raw(path: &core::ffi::CStr, mode: u32) -> Result<(), Error> {
    let path_addr = path.as_ptr() as usize;
    // SAFETY: `path` is a live NUL-terminated string for the duration of the call.
    let outcome = unsafe { syscall3(Sysno::mkdirat, AT_FDCWD as usize, path_addr, mode as usize) };
    match outcome {
        Ok(_) => Ok(()),
        Err(e) => Err(err("mkdirat", e)),
    }
}
/// Removes `path` via `unlinkat`; passes `AT_REMOVEDIR` when `remove_dir` is set.
///
/// # Errors
/// Returns the syscall's errno wrapped as an [`Error`].
pub fn unlinkat_raw(path: &core::ffi::CStr, remove_dir: bool) -> Result<(), Error> {
    let mut flags = 0;
    if remove_dir {
        flags = AT_REMOVEDIR;
    }
    let path_addr = path.as_ptr() as usize;
    // SAFETY: `path` is a live NUL-terminated string for the duration of the call.
    let outcome =
        unsafe { syscall3(Sysno::unlinkat, AT_FDCWD as usize, path_addr, flags as usize) };
    match outcome {
        Ok(_) => Ok(()),
        Err(e) => Err(err("unlinkat", e)),
    }
}
/// Renames `from` to `to` with `renameat2`, both resolved relative to `AT_FDCWD`, flags `0`.
///
/// # Errors
/// Returns the syscall's errno wrapped as an [`Error`].
pub fn renameat2_raw(from: &core::ffi::CStr, to: &core::ffi::CStr) -> Result<(), Error> {
    let cwd = AT_FDCWD as usize;
    let (old_addr, new_addr) = (from.as_ptr() as usize, to.as_ptr() as usize);
    // SAFETY: both paths are live NUL-terminated strings for the duration of the call.
    let outcome = unsafe { syscall5(Sysno::renameat2, cwd, old_addr, cwd, new_addr, 0) };
    match outcome {
        Ok(_) => Ok(()),
        Err(e) => Err(err("renameat2", e)),
    }
}
/// Queries size and file type of `path` (resolved relative to `AT_FDCWD`).
///
/// Returns `Ok(None)` when the syscall fails with errno 2, i.e. the path does not exist.
///
/// # Errors
/// Any other errno is wrapped as an [`Error`].
pub fn statx_path_raw(path: &core::ffi::CStr) -> Result<Option<RawMetadata>, Error> {
    const ENOENT: i32 = 2;
    let mut stx = Statx::default();
    // SAFETY: `path` is NUL-terminated and `stx` is a live, writable buffer for the call.
    let outcome = unsafe {
        syscall5(
            Sysno::statx,
            AT_FDCWD as usize,
            path.as_ptr() as usize,
            0,
            STATX_BASIC_STATS as usize,
            &raw mut stx as usize,
        )
    };
    if let Err(e) = outcome {
        return if e.into_raw() == ENOENT {
            Ok(None)
        } else {
            Err(err("statx", e))
        };
    }
    let file_type = stx.stx_mode & S_IFMT;
    Ok(Some(RawMetadata {
        size: stx.stx_size,
        is_dir: file_type == S_IFDIR,
        is_file: file_type == S_IFREG,
    }))
}
/// Output buffer for the `statfs` syscall, compiled only for x86_64 and aarch64.
///
/// `statfs_available_raw` reads only `f_bavail` and `f_frsize`.
#[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
#[repr(C)]
#[derive(Default, Clone, Copy)]
#[allow(
clippy::struct_field_names,
reason = "field names mirror the kernel `struct statfs` (f_type, f_bsize, …) verbatim"
)]
struct Statfs {
f_type: i64,
f_bsize: i64,
f_blocks: u64,
f_bfree: u64,
f_bavail: u64,
f_files: u64,
f_ffree: u64,
f_fsid: [i32; 2],
f_namelen: i64,
f_frsize: i64,
f_flags: i64,
f_spare: [i64; 4],
}
/// Returns `f_bavail * f_frsize` (saturating) for the filesystem that contains `path`.
///
/// # Errors
/// Returns the `statfs` errno wrapped as an [`Error`].
#[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
pub fn statfs_available_raw(path: &core::ffi::CStr) -> Result<u64, Error> {
    let mut stats = Statfs::default();
    // SAFETY: `path` is NUL-terminated and `stats` is a live, writable buffer for the call.
    let outcome =
        unsafe { syscall2(Sysno::statfs, path.as_ptr() as usize, &raw mut stats as usize) };
    if let Err(e) = outcome {
        return Err(err("statfs", e));
    }
    let fragment_size = stats.f_frsize as u64;
    Ok(stats.f_bavail.saturating_mul(fragment_size))
}
// `IoUringRaw` stores raw pointers into its ring mappings, which suppresses the
// automatic `Send` impl; this opts back in.
// NOTE(review): presumably sound because the struct owns those mappings and every ring
// operation takes `&mut self` — confirm.
unsafe impl Send for IoUringRaw {}
/// A file whose reads, writes and fsyncs are issued through a shared [`IoUringRaw`] ring.
///
/// Owns `fd`: the descriptor is closed when the value is dropped.
pub struct IoUringRawFile {
// Ring shared with the filesystem and with other open files; locked per operation.
ring: Arc<Mutex<IoUringRaw>>,
// Raw descriptor the operations target.
fd: i32,
// Position used by the streaming read/write/seek implementations.
cursor: u64,
// When set, streaming writes pass offset `u64::MAX` and leave `cursor` untouched.
is_append: bool,
}
impl IoUringRawFile {
#[must_use]
pub fn new(ring: Arc<Mutex<IoUringRaw>>, fd: i32, is_append: bool) -> Self {
Self {
ring,
fd,
cursor: 0,
is_append,
}
}
fn offset_add(base: u64, delta: i64) -> Result<u64, Error> {
let r = if delta >= 0 {
base.checked_add(delta as u64)
} else {
base.checked_sub(delta.unsigned_abs())
};
r.ok_or_else(|| Error::new(ErrorKind::InvalidInput, "seek position out of range"))
}
fn read_impl(&mut self, buf: &mut [u8]) -> Result<usize, Error> {
if buf.is_empty() {
return Ok(0);
}
let n = self.ring.lock().read_at(self.fd, buf, self.cursor)?;
self.cursor += n as u64;
Ok(n)
}
fn write_impl(&mut self, buf: &[u8]) -> Result<usize, Error> {
if buf.is_empty() {
return Ok(0);
}
if self.is_append {
return self.ring.lock().write_at(self.fd, buf, u64::MAX);
}
let n = self.ring.lock().write_at(self.fd, buf, self.cursor)?;
self.cursor += n as u64;
Ok(n)
}
fn seek_impl(&mut self, pos: SeekFrom) -> Result<u64, Error> {
let target = match pos {
SeekFrom::Start(o) => o,
SeekFrom::Current(d) => Self::offset_add(self.cursor, d)?,
SeekFrom::End(d) => Self::offset_add(file_size_raw(self.fd)?, d)?,
};
self.cursor = target;
Ok(target)
}
}
/// Closes the owned descriptor; a failing `close` is deliberately ignored because
/// `drop` cannot report it.
impl Drop for IoUringRawFile {
fn drop(&mut self) {
let _ = close_raw(self.fd);
}
}
impl FsFile for IoUringRawFile {
fn sync_all(&self) -> crate::io::Result<()> {
self.ring.lock().fsync(self.fd, false)
}
fn sync_data(&self) -> crate::io::Result<()> {
self.ring.lock().fsync(self.fd, true)
}
fn metadata(&self) -> crate::io::Result<FsMetadata> {
let m = fstat_raw(self.fd)?;
Ok(FsMetadata {
len: m.size,
is_dir: m.is_dir,
is_file: m.is_file,
})
}
fn set_len(&self, size: u64) -> crate::io::Result<()> {
ftruncate_raw(self.fd, size)
}
fn read_at(&self, buf: &mut [u8], offset: u64) -> crate::io::Result<usize> {
if buf.is_empty() {
return Ok(0);
}
let mut total = 0usize;
while total < buf.len() {
let remaining = buf
.get_mut(total..)
.ok_or_else(|| Error::new(ErrorKind::InvalidInput, "read_at slice out of range"))?;
let at = offset
.checked_add(total as u64)
.ok_or_else(|| Error::new(ErrorKind::InvalidInput, "read_at offset overflow"))?;
let n = loop {
match self.ring.lock().read_at(self.fd, remaining, at) {
Ok(n) => break n,
Err(e) if e.kind() == ErrorKind::Interrupted => {}
Err(e) => return Err(e),
}
};
if n == 0 {
break;
}
total += n;
}
Ok(total)
}
fn lock_exclusive(&self) -> crate::io::Result<()> {
flock_exclusive_raw(self.fd, false)?;
Ok(())
}
fn try_lock_exclusive(&self) -> crate::io::Result<bool> {
flock_exclusive_raw(self.fd, true)
}
}
// Streaming reads: with the `std` feature the standard `Read` trait is implemented
// (errors converted via `Into`), otherwise the crate's own `Read` trait.
// Both delegate to `read_impl`.
#[cfg(feature = "std")]
impl std::io::Read for IoUringRawFile {
fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
self.read_impl(buf).map_err(Into::into)
}
}
#[cfg(not(feature = "std"))]
impl crate::io::Read for IoUringRawFile {
fn read(&mut self, buf: &mut [u8]) -> crate::io::Result<usize> {
self.read_impl(buf)
}
}
// Streaming writes: same `std` / no-`std` split as the `Read` impls, delegating to
// `write_impl`. `flush` is a no-op because nothing is buffered in this type.
#[cfg(feature = "std")]
impl std::io::Write for IoUringRawFile {
fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
self.write_impl(buf).map_err(Into::into)
}
fn flush(&mut self) -> std::io::Result<()> {
Ok(())
}
}
#[cfg(not(feature = "std"))]
impl crate::io::Write for IoUringRawFile {
fn write(&mut self, buf: &[u8]) -> crate::io::Result<usize> {
self.write_impl(buf)
}
fn flush(&mut self) -> crate::io::Result<()> {
Ok(())
}
}
// Seeking: the `std` variant converts `std::io::SeekFrom` into the crate's `SeekFrom`
// before delegating to `seek_impl`; the no-`std` variant passes it through.
#[cfg(feature = "std")]
impl std::io::Seek for IoUringRawFile {
fn seek(&mut self, pos: std::io::SeekFrom) -> std::io::Result<u64> {
self.seek_impl(pos.into()).map_err(Into::into)
}
}
#[cfg(not(feature = "std"))]
impl crate::io::Seek for IoUringRawFile {
fn seek(&mut self, pos: SeekFrom) -> crate::io::Result<u64> {
self.seek_impl(pos)
}
}
fn path_to_cstring(path: &Path) -> Result<CString, Error> {
let s = path
.to_str()
.ok_or_else(|| Error::new(ErrorKind::InvalidInput, "path is not valid UTF-8"))?;
CString::new(s.as_bytes())
.map_err(|_| Error::new(ErrorKind::InvalidInput, "path contains an interior NUL"))
}
/// Translates [`FsOpenOptions`] into `openat` flag bits.
///
/// `append` counts as a write request for the access mode; `create_new` implies
/// `O_CREAT` and adds `O_EXCL`.
fn open_flags(opts: &FsOpenOptions) -> i32 {
    let wants_write = opts.write || opts.append;
    let access = match (opts.read, wants_write) {
        (true, true) => O_RDWR,
        (_, true) => O_WRONLY,
        (_, false) => O_RDONLY,
    };
    // Each optional bit paired with the condition that enables it.
    let optional = [
        (opts.create || opts.create_new, O_CREAT),
        (opts.create_new, O_EXCL),
        (opts.truncate, O_TRUNC),
        (opts.append, O_APPEND),
    ];
    optional
        .iter()
        .filter(|(enabled, _)| *enabled)
        .fold(access, |flags, (_, bit)| flags | bit)
}
fn read_dir_entries(fd: i32) -> Result<Vec<(String, bool)>, Error> {
let mut entries = Vec::new();
let mut buf = [0u8; 4096];
loop {
let n = unsafe {
syscall3(
Sysno::getdents64,
fd as usize,
buf.as_mut_ptr() as usize,
buf.len(),
)
}
.map_err(|e| err("getdents64", e))?;
if n == 0 {
break; }
let n = n as usize;
let mut off = 0usize;
while off + 19 <= n {
let reclen = {
let b: [u8; 2] = buf
.get(off + 16..off + 18)
.and_then(|s| s.try_into().ok())
.ok_or_else(|| {
Error::new(ErrorKind::InvalidData, "dirent reclen out of range")
})?;
usize::from(u16::from_ne_bytes(b))
};
if reclen < 19 || off + reclen > n {
return Err(Error::new(
ErrorKind::InvalidData,
"dirent record length invalid",
));
}
let d_type = *buf
.get(off + 18)
.ok_or_else(|| Error::new(ErrorKind::InvalidData, "dirent type out of range"))?;
let name_region = buf
.get(off + 19..off + reclen)
.ok_or_else(|| Error::new(ErrorKind::InvalidData, "dirent name out of range"))?;
let name_len = name_region
.iter()
.position(|&c| c == 0)
.unwrap_or(name_region.len());
let name_bytes = name_region
.get(..name_len)
.ok_or_else(|| Error::new(ErrorKind::InvalidData, "dirent name slice"))?;
let name = core::str::from_utf8(name_bytes)
.map_err(|_| Error::new(ErrorKind::InvalidData, "dirent name is not UTF-8"))?;
if name != "." && name != ".." {
entries.push((name.to_string(), d_type == DT_DIR));
}
off += reclen;
}
}
Ok(entries)
}
/// [`Fs`] implementation backed by raw syscalls and one shared [`IoUringRaw`] ring.
///
/// Every file opened through it receives a clone of the same ring handle.
pub struct IoUringRawFs {
ring: Arc<Mutex<IoUringRaw>>,
}
impl IoUringRawFs {
    /// Creates the filesystem together with its ring of `ring_entries` submission slots.
    ///
    /// # Errors
    /// Propagates any failure from [`IoUringRaw::new`].
    pub fn new(ring_entries: u32) -> Result<Self, Error> {
        let raw = IoUringRaw::new(ring_entries)?;
        let ring = Arc::new(Mutex::new(raw));
        Ok(Self { ring })
    }
}
impl Fs for IoUringRawFs {
    /// Opens `path` with the flags derived from `opts` and creation mode `0o644`.
    /// The returned file shares this filesystem's ring.
    fn open(&self, path: &Path, opts: &FsOpenOptions) -> crate::io::Result<Box<dyn FsFile>> {
        let cpath = path_to_cstring(path)?;
        let fd = open_raw(&cpath, open_flags(opts), 0o644)?;
        Ok(Box::new(IoUringRawFile::new(
            Arc::clone(&self.ring),
            fd,
            opts.append,
        )))
    }

    /// Creates `path` and every missing ancestor.
    ///
    /// An already-existing *directory* is not an error. If the path exists but is not
    /// a directory, the `AlreadyExists` error is returned instead of reporting a
    /// success that would leave the caller without a directory.
    fn create_dir_all(&self, path: &Path) -> crate::io::Result<()> {
        if let Some(parent) = path.parent()
            && !parent.to_str().unwrap_or("").is_empty()
        {
            self.create_dir_all(parent)?;
        }
        match self.create_dir(path) {
            Ok(()) => Ok(()),
            Err(e) if e.kind() == ErrorKind::AlreadyExists => match self.metadata(path) {
                Ok(existing) if existing.is_dir => Ok(()),
                // Not a directory (or no longer inspectable): surface the original error.
                _ => Err(e),
            },
            Err(e) => Err(e),
        }
    }

    /// Creates the single directory `path` with mode `0o755`.
    fn create_dir(&self, path: &Path) -> crate::io::Result<()> {
        mkdirat_raw(&path_to_cstring(path)?, 0o755)
    }

    /// Lists the entries of the directory `path`, excluding `.` and `..`.
    fn read_dir(&self, path: &Path) -> crate::io::Result<Vec<FsDirEntry>> {
        let cpath = path_to_cstring(path)?;
        let fd = open_raw(&cpath, O_RDONLY | O_DIRECTORY, 0)?;
        let result = read_dir_entries(fd);
        // Close before propagating a listing error so the descriptor never leaks.
        let _ = close_raw(fd);
        let names = result?;
        Ok(names
            .into_iter()
            .map(|(name, is_dir)| FsDirEntry {
                path: path.join(&name),
                file_name: name,
                is_dir,
            })
            .collect())
    }

    /// Removes the non-directory entry `path`.
    fn remove_file(&self, path: &Path) -> crate::io::Result<()> {
        unlinkat_raw(&path_to_cstring(path)?, false)
    }

    /// Recursively removes the directory `path`: children first, then the directory itself.
    fn remove_dir_all(&self, path: &Path) -> crate::io::Result<()> {
        for entry in self.read_dir(path)? {
            let child = path.join(&entry.file_name);
            if entry.is_dir {
                self.remove_dir_all(&child)?;
            } else {
                self.remove_file(&child)?;
            }
        }
        unlinkat_raw(&path_to_cstring(path)?, true)
    }

    /// Renames `from` to `to`.
    fn rename(&self, from: &Path, to: &Path) -> crate::io::Result<()> {
        renameat2_raw(&path_to_cstring(from)?, &path_to_cstring(to)?)
    }

    /// Returns size and type of `path`; a missing path is reported as `NotFound`.
    fn metadata(&self, path: &Path) -> crate::io::Result<FsMetadata> {
        match statx_path_raw(&path_to_cstring(path)?)? {
            Some(m) => Ok(FsMetadata {
                len: m.size,
                is_dir: m.is_dir,
                is_file: m.is_file,
            }),
            None => Err(Error::new(ErrorKind::NotFound, "path not found")),
        }
    }

    /// Bytes available on the filesystem containing `path` (x86_64 and aarch64 only).
    #[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
    fn available_space(&self, path: &Path) -> crate::io::Result<u64> {
        statfs_available_raw(&path_to_cstring(path)?)
    }

    /// Opens the directory `path` and fsyncs it through the ring.
    fn sync_directory(&self, path: &Path) -> crate::io::Result<()> {
        let cpath = path_to_cstring(path)?;
        let fd = open_raw(&cpath, O_RDONLY | O_DIRECTORY, 0)?;
        let r = self.ring.lock().fsync(fd, false);
        // Close regardless of the fsync outcome; the fsync result is what callers see.
        let _ = close_raw(fd);
        r
    }

    /// `true` when `statx` succeeds for `path`, `false` when it reports "not found".
    fn exists(&self, path: &Path) -> crate::io::Result<bool> {
        Ok(statx_path_raw(&path_to_cstring(path)?)?.is_some())
    }
}
/// A minimal io_uring instance: the ring descriptor, its memory mappings and the
/// pointers into them that `submit_and_reap_one` needs.
///
/// All mappings and the descriptor are released in `Drop`.
pub struct IoUringRaw {
// Descriptor returned by `io_uring_setup`.
ring_fd: i32,
// Base and length of the SQ ring mapping (also holds the CQ ring in single-mmap mode).
sq_ptr: *mut u8,
sq_len: usize,
// Base and length of the separate CQ ring mapping; null / 0 in single-mmap mode.
cq_ptr: *mut u8,
cq_len: usize,
// Base and byte length of the SQE array mapping.
sqes: *mut IoUringSqe,
sqes_len: usize,
// Number of submission entries reported by the kernel in `IoUringParams::sq_entries`.
sq_entries: u32,
// Pointers to the SQ head and tail words, the SQ index mask, and the SQ index array.
sq_khead: *const u32,
sq_ktail: *mut u32,
sq_ring_mask: u32,
sq_array: *mut u32,
// Pointers to the CQ head and tail words, the CQ index mask, and the CQE array.
cq_khead: *mut u32,
cq_ktail: *const u32,
cq_ring_mask: u32,
cqes: *const IoUringCqe,
}
impl IoUringRaw {
/// Creates a ring with `entries` requested submission slots and maps its memory.
///
/// Steps: `io_uring_setup`, map the SQ ring, map the CQ ring (skipped when the
/// kernel advertises `IORING_FEAT_SINGLE_MMAP`), map the SQE array, then derive the
/// field pointers from the offsets the kernel wrote into `params`.
///
/// # Errors
/// Returns the error of the failing `io_uring_setup` or `mmap` call. On every
/// failure path the mappings made so far are unmapped and the ring descriptor is
/// closed by `FdGuard`.
#[cfg_attr(coverage_nightly, coverage(off))]
pub fn new(entries: u32) -> Result<Self, Error> {
let mut params = IoUringParams::default();
let ring_fd = unsafe { io_uring_setup(entries, &raw mut params) }?;
// Closes `ring_fd` on any early return below; disarmed once construction succeeds.
let guard = FdGuard(ring_fd);
let sq_entries = params.sq_entries;
let single_mmap = params.features & IORING_FEAT_SINGLE_MMAP != 0;
// Ring sizes: offset of the trailing array plus the array itself.
let sq_ring_sz = params.sq_off.array as usize + sq_entries as usize * size_of::<u32>();
let cq_ring_sz =
params.cq_off.cqes as usize + params.cq_entries as usize * size_of::<IoUringCqe>();
// In single-mmap mode one mapping must be large enough for both rings.
let sq_len = if single_mmap {
sq_ring_sz.max(cq_ring_sz)
} else {
sq_ring_sz
};
let sq_ptr = unsafe {
mmap(
sq_len,
PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_POPULATE,
ring_fd,
IORING_OFF_SQ_RING,
)
}?;
// `cq_base` is where CQ offsets are applied; `cq_ptr`/`cq_len` describe only a
// *separate* CQ mapping, so they stay null/0 when the SQ mapping is reused.
let (cq_ptr, cq_len, cq_base) = if single_mmap {
(core::ptr::null_mut(), 0usize, sq_ptr)
} else {
let p = match unsafe {
mmap(
cq_ring_sz,
PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_POPULATE,
ring_fd,
IORING_OFF_CQ_RING,
)
} {
Ok(p) => p,
Err(e) => {
unsafe { munmap(sq_ptr, sq_len) };
return Err(e);
}
};
(p, cq_ring_sz, p)
};
let sqes = match unsafe {
mmap(
sq_entries as usize * size_of::<IoUringSqe>(),
PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_POPULATE,
ring_fd,
IORING_OFF_SQES,
)
} {
Ok(p) => p.cast::<IoUringSqe>(),
Err(e) => {
unsafe {
munmap(sq_ptr, sq_len);
if !cq_ptr.is_null() {
munmap(cq_ptr, cq_len);
}
}
return Err(e);
}
};
// From here on `Drop for IoUringRaw` owns the descriptor.
guard.disarm();
let sqes_len = sq_entries as usize * size_of::<IoUringSqe>();
// The ring masks are read once from the mapping; the other fields are kept as pointers.
Ok(unsafe {
Self {
ring_fd,
sq_ptr,
sq_len,
cq_ptr,
cq_len,
sqes,
sqes_len,
sq_entries,
sq_khead: sq_ptr.add(params.sq_off.head as usize).cast(),
sq_ktail: sq_ptr.add(params.sq_off.tail as usize).cast(),
sq_ring_mask: *(sq_ptr.add(params.sq_off.ring_mask as usize).cast::<u32>()),
sq_array: sq_ptr.add(params.sq_off.array as usize).cast(),
cq_khead: cq_base.add(params.cq_off.head as usize).cast(),
cq_ktail: cq_base.add(params.cq_off.tail as usize).cast(),
cq_ring_mask: *(cq_base.add(params.cq_off.ring_mask as usize).cast::<u32>()),
cqes: cq_base.add(params.cq_off.cqes as usize).cast(),
}
})
}
/// Submits a no-op carrying `user_data` and returns its completion result.
///
/// # Errors
/// Propagates any submission or completion error.
pub fn nop(&mut self, user_data: u64) -> Result<i32, Error> {
    let request = IoUringSqe {
        user_data,
        opcode: IORING_OP_NOP,
        ..IoUringSqe::default()
    };
    self.submit_and_reap_one(&request)
}
pub fn read_at(&mut self, fd: i32, buf: &mut [u8], offset: u64) -> Result<usize, Error> {
let len = u32::try_from(buf.len()).map_err(|_| {
Error::new(
ErrorKind::InvalidInput,
"read length exceeds the 4 GiB io_uring single-op limit",
)
})?;
let sqe = IoUringSqe {
opcode: IORING_OP_READ,
fd,
addr: buf.as_mut_ptr() as u64,
len,
off: offset,
..IoUringSqe::default()
};
let res = self.submit_and_reap_one(&sqe)?;
Ok(res as usize)
}
pub fn write_at(&mut self, fd: i32, buf: &[u8], offset: u64) -> Result<usize, Error> {
let len = u32::try_from(buf.len()).map_err(|_| {
Error::new(
ErrorKind::InvalidInput,
"write length exceeds the 4 GiB io_uring single-op limit",
)
})?;
let sqe = IoUringSqe {
opcode: IORING_OP_WRITE,
fd,
addr: buf.as_ptr() as u64,
len,
off: offset,
..IoUringSqe::default()
};
let res = self.submit_and_reap_one(&sqe)?;
Ok(res as usize)
}
/// Issues an fsync for `fd` through the ring; `datasync` sets `IORING_FSYNC_DATASYNC`.
///
/// # Errors
/// Propagates any submission or completion error.
pub fn fsync(&mut self, fd: i32, datasync: bool) -> Result<(), Error> {
    let mut op_flags = 0;
    if datasync {
        op_flags = IORING_FSYNC_DATASYNC;
    }
    let request = IoUringSqe {
        opcode: IORING_OP_FSYNC,
        fd,
        op_flags,
        ..IoUringSqe::default()
    };
    self.submit_and_reap_one(&request).map(|_| ())
}
/// Publishes `sqe`, submits it, waits for its completion and returns the CQE result.
///
/// The ring is used strictly one operation at a time, so the next CQE to appear is
/// the one for `sqe`.
///
/// `io_uring_enter` may report the SQE as consumed while its wait was cut short
/// (for example by a signal). The operation is then still in flight and may still
/// access the caller's buffer. So once the SQE has been consumed this keeps waiting,
/// with nothing further to submit, until the completion is actually present. It
/// never returns early and never leaves a stale CQE for the next call.
///
/// # Errors
/// * `Other` when the submission queue is full, or when the kernel neither consumed
///   the SQE nor produced a completion;
/// * any non-`Interrupted` error from `io_uring_enter`;
/// * the operation's own errno when the CQE result is negative.
#[cfg_attr(coverage_nightly, coverage(off))]
fn submit_and_reap_one(&mut self, sqe: &IoUringSqe) -> Result<i32, Error> {
    // SAFETY: every pointer was derived in `new` from mappings that live as long as
    // `self`, and indices are masked with the ring mask before use.
    unsafe {
        let tail = core::ptr::read_volatile(self.sq_ktail);
        let head = core::ptr::read_volatile(self.sq_khead);
        fence(Ordering::Acquire);
        if tail.wrapping_sub(head) >= self.sq_entries {
            return Err(Error::new(
                ErrorKind::Other,
                "io_uring submission queue is full",
            ));
        }
        let index = tail & self.sq_ring_mask;
        core::ptr::write(self.sqes.add(index as usize), *sqe);
        core::ptr::write(self.sq_array.add(index as usize), index);
        // The SQE and its index must be visible before the new tail is.
        fence(Ordering::Release);
        core::ptr::write_volatile(self.sq_ktail, tail.wrapping_add(1));
    }

    // 1 until the kernel reports the SQE as consumed, then 0 (wait only).
    let mut to_submit = 1u32;
    let res = loop {
        // SAFETY: `ring_fd` is this ring's descriptor; the memory referenced by the
        // SQE is borrowed by our caller for the whole call.
        match unsafe { io_uring_enter(self.ring_fd, to_submit, 1, IORING_ENTER_GETEVENTS) } {
            Ok(consumed) => to_submit = to_submit.saturating_sub(consumed),
            Err(e) if e.kind() == ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        }
        // SAFETY: same mapping invariants as above.
        let reaped = unsafe {
            let head = core::ptr::read_volatile(self.cq_khead);
            let ktail = core::ptr::read_volatile(self.cq_ktail);
            fence(Ordering::Acquire);
            if head == ktail {
                None
            } else {
                let cqe = core::ptr::read(self.cqes.add((head & self.cq_ring_mask) as usize));
                // Finish reading the CQE before handing the slot back to the kernel.
                fence(Ordering::Release);
                core::ptr::write_volatile(self.cq_khead, head.wrapping_add(1));
                Some(cqe.res)
            }
        };
        match reaped {
            Some(res) => break res,
            // Nothing consumed and nothing completed: waiting again cannot help.
            None if to_submit != 0 => {
                return Err(Error::new(
                    ErrorKind::Other,
                    "io_uring_enter returned with no completion",
                ));
            }
            // Consumed but not finished yet (interrupted wait): wait again.
            None => {}
        }
    };
    if res < 0 {
        return Err(Error::new(
            errno_to_kind(-res),
            format!("io_uring op completed with errno {}", -res),
        ));
    }
    Ok(res)
}
/// Number of submission-queue entries the kernel reported for this ring at setup.
#[must_use]
pub fn sq_entries(&self) -> u32 {
self.sq_entries
}
}
/// Releases the ring: unmaps the SQE array, the separate CQ mapping (if one exists),
/// the SQ mapping, and finally closes the ring descriptor. Errors are ignored.
impl Drop for IoUringRaw {
fn drop(&mut self) {
unsafe {
munmap(self.sqes.cast(), self.sqes_len);
// `cq_ptr` is null when the CQ ring shares the SQ mapping (single-mmap mode).
if !self.cq_ptr.is_null() {
munmap(self.cq_ptr, self.cq_len);
}
munmap(self.sq_ptr, self.sq_len);
close(self.ring_fd);
}
}
}
/// Closes the wrapped descriptor when dropped, unless [`FdGuard::disarm`] was called.
///
/// `IoUringRaw::new` uses it so the ring descriptor is not leaked on early returns.
struct FdGuard(i32);
impl FdGuard {
// Gives up ownership without closing: `forget` skips the `Drop` impl below.
fn disarm(self) {
core::mem::forget(self);
}
}
impl Drop for FdGuard {
#[cfg_attr(coverage_nightly, coverage(off))]
fn drop(&mut self) {
unsafe { close(self.0) };
}
}
#[cfg(test)]
#[expect(clippy::expect_used, reason = "test code")]
mod tests {
use super::*;
// Exercises `IoUringRawFile` end to end on a temp file: streaming write, metadata,
// positional read, seek + streaming read, both sync flavours, truncate and try-lock.
#[test]
fn raw_file_fsfile_round_trips() {
use std::io::{Read, Seek, Write};
let tmp = tempfile::tempdir().expect("tempdir");
let path = tmp.path().join("iou_rawfile.bin");
let cpath = std::ffi::CString::new(path.to_str().expect("utf8 path"))
.expect("path has no interior NUL");
let fd =
open_raw(&cpath, O_CREAT | O_RDWR | O_TRUNC, 0o600).expect("openat should succeed");
let ring = Arc::new(Mutex::new(IoUringRaw::new(8).expect("ring setup")));
let mut file = IoUringRawFile::new(ring, fd, false);
let payload = b"raw io_uring file round-trip payload";
let written = Write::write(&mut file, payload).expect("write");
assert_eq!(written, payload.len(), "short write");
assert_eq!(
FsFile::metadata(&file).expect("metadata").len,
payload.len() as u64,
"metadata len must match what was written"
);
let mut rb = vec![0u8; payload.len()];
let n = FsFile::read_at(&file, &mut rb, 0).expect("read_at");
assert_eq!(n, payload.len(), "short read_at");
assert_eq!(&rb, payload, "read_at bytes must match");
file.seek(std::io::SeekFrom::Start(4)).expect("seek");
let mut tail = vec![0u8; payload.len() - 4];
let mut got = 0;
// Streaming reads may be short, so loop until the tail is filled or EOF is hit.
while got < tail.len() {
let chunk = tail.get_mut(got..).expect("in-bounds");
let r = Read::read(&mut file, chunk).expect("read");
if r == 0 {
break;
}
got += r;
}
assert_eq!(
tail.get(..got).expect("in-bounds"),
payload.get(4..).expect("in-bounds"),
"seek+read bytes must match"
);
FsFile::sync_all(&file).expect("sync_all");
FsFile::sync_data(&file).expect("sync_data");
FsFile::set_len(&file, 4).expect("set_len");
assert_eq!(
FsFile::metadata(&file).expect("metadata after set_len").len,
4,
"set_len must shrink the file"
);
assert!(
FsFile::try_lock_exclusive(&file).expect("try_lock_exclusive"),
"exclusive lock on a fresh file must succeed"
);
drop(file); }
// Walks the `Fs` surface of `IoUringRawFs`: create_dir_all, exists, metadata, open +
// write, read_dir, rename, remove_file and remove_dir_all, all inside a temp dir.
#[test]
fn raw_fs_directory_and_file_ops() {
use crate::fs::Fs;
use crate::path::Path;
use std::io::Write;
let tmp = tempfile::tempdir().expect("tempdir");
let base = tmp.path().join("rawfs");
let base_s = base.to_str().expect("utf8 temp path");
let fs = IoUringRawFs::new(8).expect("fs setup");
let dir = Path::new(base_s);
fs.create_dir_all(dir).expect("create_dir_all");
assert!(fs.exists(dir).expect("dir exists"));
assert!(fs.metadata(dir).expect("dir metadata").is_dir);
let fpath = dir.join("a.txt");
let mut file = fs
.open(&fpath, &FsOpenOptions::new().write(true).create(true))
.expect("open");
file.write_all(b"hello").expect("write");
file.sync_all().expect("sync");
drop(file);
assert_eq!(fs.metadata(&fpath).expect("file metadata").len, 5);
let entries = fs.read_dir(dir).expect("read_dir");
assert!(
entries.iter().any(|e| e.file_name == "a.txt" && !e.is_dir),
"read_dir must list the written file"
);
let fpath2 = dir.join("b.txt");
fs.rename(&fpath, &fpath2).expect("rename");
assert!(!fs.exists(&fpath).expect("old gone"));
assert!(fs.exists(&fpath2).expect("new present"));
fs.remove_file(&fpath2).expect("remove_file");
assert!(!fs.exists(&fpath2).expect("file gone"));
fs.remove_dir_all(dir).expect("remove_dir_all");
assert!(!fs.exists(dir).expect("dir gone"));
}
// Opens the same file twice in append mode and checks that the second write lands
// after the first rather than overwriting it.
#[test]
fn raw_file_append_writes_accumulate_at_eof() {
use crate::fs::Fs;
use crate::path::Path;
use std::io::Write;
let tmp = tempfile::tempdir().expect("tempdir");
let p = tmp.path().join("log.bin");
let ps = p.to_str().expect("utf8 temp path");
let fs = IoUringRawFs::new(8).expect("fs setup");
let path = Path::new(ps);
{
let mut f = fs
.open(path, &FsOpenOptions::new().append(true).create(true))
.expect("open append");
f.write_all(b"aaa").expect("first append");
f.sync_all().expect("sync");
}
{
let mut f = fs
.open(path, &FsOpenOptions::new().append(true).create(true))
.expect("reopen append");
f.write_all(b"bbb").expect("second append");
f.sync_all().expect("sync");
}
let f = fs
.open(path, &FsOpenOptions::new().read(true))
.expect("open read");
let mut buf = [0u8; 6];
let n = f.read_at(&mut buf, 0).expect("read_at");
assert_eq!(n, 6, "both appends must be present");
assert_eq!(&buf, b"aaabbb", "appends land in order at EOF");
}
// Smoke test: a ring can be created and a single NOP completes with result 0.
#[test]
fn ring_setup_and_nop_round_trips() {
let mut ring = IoUringRaw::new(8).expect("io_uring_setup + mmap should succeed");
assert!(
ring.sq_entries() >= 8,
"kernel rounds entries up to >= request"
);
let res = ring
.nop(0xDEAD_BEEF)
.expect("NOP submit/complete should succeed");
assert_eq!(res, 0, "NOP res must be 0");
}
// Submits more NOPs (16) than the ring was asked to hold (4), so ring indices must wrap.
#[test]
fn multiple_nops_reuse_slots() {
let mut ring = IoUringRaw::new(4).expect("setup");
for i in 0..16u64 {
let res = ring.nop(i).expect("nop");
assert_eq!(res, 0);
}
}
// Writes, fsyncs and reads back a payload at a non-zero offset directly through the
// ring, then checks that a read past EOF returns 0.
#[test]
fn file_write_fsync_read_round_trips_through_the_ring() {
    // A private temp dir (as in the sibling tests) is removed by its guard even when
    // an assertion below panics, so a failing run leaves nothing in the shared temp dir.
    let tmp = tempfile::tempdir().expect("tempdir");
    let path = tmp.path().join("iou_raw_rt.bin");
    let cpath = std::ffi::CString::new(path.to_str().expect("utf8 path"))
        .expect("path has no interior NUL");
    let fd =
        open_raw(&cpath, O_CREAT | O_RDWR | O_TRUNC, 0o600).expect("openat should succeed");
    let mut ring = IoUringRaw::new(8).expect("ring setup");
    let payload = b"io_uring raw round-trip payload";
    let offset = 4096u64;
    let written = ring
        .write_at(fd, payload, offset)
        .expect("write_at should succeed");
    assert_eq!(written, payload.len(), "short write");
    ring.fsync(fd, false).expect("fsync should succeed");
    let mut readback = vec![0u8; payload.len()];
    let read = ring
        .read_at(fd, &mut readback, offset)
        .expect("read_at should succeed");
    assert_eq!(read, payload.len(), "short read");
    assert_eq!(
        &readback, payload,
        "read-back bytes must match what we wrote"
    );
    let mut tail = [0u8; 8];
    let eof = ring
        .read_at(fd, &mut tail, offset + payload.len() as u64)
        .expect("read at EOF should succeed");
    assert_eq!(eof, 0, "read past EOF must return 0");
    close_raw(fd).expect("close should succeed");
}
// Pins the errno-to-kind table, including the fallback for an unlisted value (132).
#[test]
fn errno_maps_to_expected_error_kinds() {
    let expectations = [
        (1, ErrorKind::PermissionDenied),
        (2, ErrorKind::NotFound),
        (4, ErrorKind::Interrupted),
        (9, ErrorKind::InvalidInput),
        (11, ErrorKind::WouldBlock),
        (13, ErrorKind::PermissionDenied),
        (17, ErrorKind::AlreadyExists),
        (22, ErrorKind::InvalidInput),
        (95, ErrorKind::Unsupported),
        (132, ErrorKind::Other),
    ];
    for (errno, expected) in expectations {
        assert_eq!(errno_to_kind(errno), expected);
    }
}
// A request for zero entries must fail at setup and surface as `InvalidInput`.
#[test]
fn ring_setup_with_zero_entries_is_rejected() {
match IoUringRaw::new(0) {
Ok(_) => panic!("zero-entry ring must be rejected"),
Err(e) => assert_eq!(e.kind(), ErrorKind::InvalidInput),
}
}
// Opening a path under a non-existent directory must map to `NotFound`.
#[test]
fn open_raw_missing_file_returns_not_found() {
let cpath = std::ffi::CString::new("/proc/does-not-exist/iou_raw_missing")
.expect("no interior NUL");
let err = open_raw(&cpath, O_RDONLY, 0).expect_err("missing file must fail to open");
assert_eq!(err.kind(), ErrorKind::NotFound);
}
// A write to a descriptor that is not open must come back as a completion error;
// the test expects it to map to `InvalidInput`.
#[test]
fn ring_op_on_bad_fd_surfaces_completion_error() {
let mut ring = IoUringRaw::new(4).expect("ring setup");
let err = ring
.write_at(1 << 30, b"x", 0)
.expect_err("write to a non-open fd must fail");
assert_eq!(err.kind(), ErrorKind::InvalidInput); }
// `available_space` on a real temp dir must report a value strictly between 0 and `u64::MAX`.
#[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
#[test]
fn raw_available_space_reports_plausible_free_bytes() {
use crate::fs::Fs;
use crate::path::Path;
let tmp = tempfile::tempdir().expect("tempdir");
let fs = IoUringRawFs::new(8).expect("fs setup");
let free = fs
.available_space(Path::new(tmp.path().to_str().expect("utf8 path")))
.expect("statfs must succeed on a real filesystem");
assert!(
free > 0,
"a writable tempdir filesystem must report free space"
);
assert!(
free < u64::MAX,
"a real probe must not return the unbounded sentinel"
);
}
// `statfs` on a path that does not exist must return an error rather than a size.
#[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
#[test]
fn raw_statfs_on_missing_path_errors() {
let cpath =
std::ffi::CString::new("/proc/does-not-exist/iou_raw_statfs").expect("no interior NUL");
assert!(statfs_available_raw(&cpath).is_err());
}
}