kevy_uring/ffi.rs
1//! Raw extern "C" declarations for the syscalls the engine needs (`mmap`,
2//! `munmap`, `close`, `syscall`) plus the io_uring kernel ABI constants
3//! everything else in the crate references.
4
5use core::ffi::{c_int, c_long, c_void};
6
unsafe extern "C" {
    /// libc `mmap(2)`: map `len` bytes with protection `prot` and mapping
    /// `flags`, backed by `fd` at file offset `off` (or anonymous memory).
    ///
    /// Returns the mapping address, or `MAP_FAILED` (`(void *)-1`) with
    /// `errno` set on failure.
    ///
    /// NOTE(review): `off` is declared `i64`, which matches `off_t` only
    /// where `off_t` is 64 bits wide — confirm before building for a 32-bit
    /// target.
    ///
    /// # Safety
    /// All arguments must satisfy the `mmap(2)` contract; the returned
    /// region is raw memory whose lifetime the caller manages via
    /// [`munmap`].
    pub unsafe fn mmap(
        addr: *mut c_void,
        len: usize,
        prot: c_int,
        flags: c_int,
        fd: c_int,
        off: i64,
    ) -> *mut c_void;

    /// libc `munmap(2)`: release the mapping `[addr, addr + len)`.
    /// Returns `0` on success, `-1` with `errno` set on failure.
    ///
    /// # Safety
    /// No live reference or pointer into the range may be used afterwards.
    pub unsafe fn munmap(addr: *mut c_void, len: usize) -> c_int;

    /// libc `close(2)`: close file descriptor `fd`.
    /// Returns `0` on success, `-1` with `errno` set on failure.
    ///
    /// # Safety
    /// `fd` must not be owned by any other live handle (double-close hazard).
    pub unsafe fn close(fd: c_int) -> c_int;

    /// libc `syscall(2)`: issue raw system call `num`. Needed because
    /// io_uring has no glibc wrapper. Variadic in C, so every argument must
    /// be passed with the exact C type the kernel entry point expects.
    ///
    /// # Safety
    /// The argument list must match the ABI of syscall `num`.
    pub unsafe fn syscall(num: c_long, ...) -> c_long;
}
21
// ---- io_uring syscall numbers ----------------------------------------------
//
// 425..=427 is the unified numbering shared by x86, x86-64, arm, aarch64,
// riscv, powerpc, s390x and the other architectures that follow the generic
// syscall table. It is NOT universal: MIPS adds its per-ABI base
// (o32 4000 / n64 5000 / n32 6000) and Alpha uses 535..=537, so these
// constants must not be used unchanged on those targets.

/// `io_uring_setup(2)` syscall number, for use as the `num` argument of the
/// raw `syscall` binding.
pub const SYS_IO_URING_SETUP: c_long = 425;
/// `io_uring_enter(2)` syscall number.
pub const SYS_IO_URING_ENTER: c_long = 426;
/// `io_uring_register(2)` syscall number.
pub const SYS_IO_URING_REGISTER: c_long = 427;
27
28// ---- mmap protection / flags ----------------------------------------------
29
/// `mmap` protection bit: pages may be read.
pub const PROT_READ: c_int = 0x1;
/// `mmap` protection bit: pages may be written.
pub const PROT_WRITE: c_int = 0x2;
/// `mmap` flag: updates are shared with every other mapping of the same
/// object.
pub const MAP_SHARED: c_int = 0x1;
/// `mmap` flag: private copy-on-write mapping.
pub const MAP_PRIVATE: c_int = 0x2;
/// `mmap` flag: memory is not backed by a file. This is the generic Linux
/// value; some architectures (e.g. MIPS, Alpha) define a different one.
pub const MAP_ANONYMOUS: c_int = 0x20;
/// `mmap` flag: pre-fault the mapping's pages at map time. Generic Linux
/// value; some architectures (e.g. MIPS, Alpha) define a different one.
pub const MAP_POPULATE: c_int = 0x8000;
36
37// ---- mmap region offsets (file-offset selectors for the three regions) ---
38
/// File offset that selects the submission-queue ring region.
pub const IORING_OFF_SQ_RING: i64 = 0;
/// File offset that selects the completion-queue ring region.
pub const IORING_OFF_CQ_RING: i64 = 0x0800_0000;
/// File offset that selects the SQE array region.
pub const IORING_OFF_SQES: i64 = 0x1000_0000;
42
43// ---- io_uring_setup flags -------------------------------------------------
44
/// Run the kernel-side submission poll thread (SQPOLL). With this flag set,
/// the kernel polls the SQ from a dedicated kernel thread, so
/// `io_uring_enter` becomes unnecessary in the steady state — submissions
/// are picked up without a syscall.
pub const IORING_SETUP_SQPOLL: u32 = 1 << 1;

/// Pin the SQPOLL kernel thread to `sq_thread_cpu`. Requires `IORING_SETUP_SQPOLL`.
pub const IORING_SETUP_SQ_AFF: u32 = 1 << 2;

/// **Linux 5.19+**. Hint that all SQEs come from "cooperative task" context
/// (the user thread is itself processing CQEs). Lets the kernel skip a
/// `task_work_add`/IPI on the completion path. Free win when the same
/// thread that calls `io_uring_enter` is the one that drains CQEs.
pub const IORING_SETUP_COOP_TASKRUN: u32 = 1 << 8;

/// **Linux 6.0+**. Declare that **only one thread** ever submits to this
/// ring. Lets the kernel skip locking on the submission path. Safe for
/// kevy's per-shard rings (one shard thread owns each ring exclusively).
pub const IORING_SETUP_SINGLE_ISSUER: u32 = 1 << 12;

/// **Linux 6.1+**. Defer all completion task_work to the user thread's
/// `io_uring_enter` call instead of running it from an IPI. Pairs with
/// `SINGLE_ISSUER` and slashes the cost of completion-side bookkeeping.
/// Requires `SINGLE_ISSUER` to be set as well.
///
/// **Defined but not used in kevy** — see the E2 attack notes in
/// `bench/PERF-ATTACK-LOG-2026-06-20.md`. The constant is kept in the
/// ABI table for documentation + future single-threaded reactor callers;
/// the `allow` below silences the resulting unused-constant warning.
#[allow(dead_code)]
pub const IORING_SETUP_DEFER_TASKRUN: u32 = 1 << 13;
75
76// ---- io_uring_enter flags -------------------------------------------------
77
/// Ask `io_uring_enter` to wait for completions (its `min_complete`
/// argument) before returning.
pub const IORING_ENTER_GETEVENTS: u32 = 1 << 0;

/// Kick a parked SQPOLL kernel thread. Userland checks the shared `sq_flags`
/// word for `IORING_SQ_NEED_WAKEUP` and, whenever that bit is set, must pass
/// this flag to `io_uring_enter`.
pub const IORING_ENTER_SQ_WAKEUP: u32 = 1 << 1;

// ---- shared SQ ring flag bits ---------------------------------------------

/// Set by the kernel once the SQPOLL thread has gone idle for longer than
/// `sq_thread_idle` ms and parked itself. While it is set, userland MUST
/// call `io_uring_enter` with `IORING_ENTER_SQ_WAKEUP` to re-arm the thread.
pub const IORING_SQ_NEED_WAKEUP: u32 = 1 << 0;
91
92// ---- Operation opcodes (subset we use) ------------------------------------
93
// Opcodes are listed in ascending numeric order, mirroring the kernel's
// `enum io_uring_op`, so a gap or duplicate is easy to spot.

/// No-op request; completes without doing any I/O.
pub const IORING_OP_NOP: u8 = 0;
/// Vectored write (`writev`-style); consumes an array of [`Iovec`].
pub const IORING_OP_WRITEV: u8 = 2;
/// Timeout request.
pub const IORING_OP_TIMEOUT: u8 = 11;
/// Accept a connection on a listening socket.
pub const IORING_OP_ACCEPT: u8 = 13;
/// Plain (non-vectored) read.
pub const IORING_OP_READ: u8 = 22;
/// Plain (non-vectored) write.
pub const IORING_OP_WRITE: u8 = 23;
/// Socket receive.
pub const IORING_OP_RECV: u8 = 27;

/// POSIX `struct iovec` for `IORING_OP_WRITEV`. Matches the kernel
/// layout (pointer + length). L1 (2026-06-21): the reactor's reply
/// path submits an `&[Iovec]` so [bulk-header, value-bytes,
/// trailing-CRLF] fuse into ONE syscall — skipping the per-GET
/// memcpy of the value into the per-conn output Vec.
///
/// **Lifetime**: the kernel reads the iovec array AND each iovec's
/// `base` slice asynchronously. The caller must keep both alive until
/// the matching CQE fires (`uring_arm_conns` parks them in the conn's
/// pending-writes state and drops on completion).
#[repr(C)]
#[derive(Clone, Copy)]
pub struct Iovec {
    /// Pointer to bytes.
    pub iov_base: *const u8,
    /// Number of bytes at `iov_base`.
    pub iov_len: usize,
}

// Compile-time guard: the kernel reads this struct as two machine words
// (`void *iov_base; size_t iov_len;`). Fail the build rather than submit a
// mis-sized iovec if the layout ever drifts.
const _: () = {
    assert!(core::mem::size_of::<Iovec>() == 2 * core::mem::size_of::<usize>());
    assert!(core::mem::align_of::<Iovec>() == core::mem::align_of::<usize>());
};
120
121// accept4 flags set on the accepted socket (carried in the SQE's accept_flags
122// field, which aliases `rw_flags`).
/// `accept4` flag: the accepted socket starts out non-blocking.
pub const SOCK_NONBLOCK: u32 = 0x800;
/// `accept4` flag: the accepted socket is marked close-on-exec.
pub const SOCK_CLOEXEC: u32 = 0x8_0000;
125
126// ---- SQE flags / ioprio bits for buffer-select + multishot recv -----------
127
/// (SQE `flags`) The SQE picks a buffer from a provided-buffer group.
pub const IOSQE_BUFFER_SELECT: u8 = 1 << 5;
/// (`ioprio`) Multishot recv: re-fire one recv completion per arrival.
pub const IORING_RECV_MULTISHOT: u16 = 1 << 1;
/// **Linux 5.19+**. (`ioprio` bit for `IORING_OP_ACCEPT`) Multishot accept:
/// re-fire one accept completion per arriving connection. The kernel keeps
/// the accept SQE armed across completions; each CQE carries the new fd in
/// `res`, and `IORING_CQE_F_MORE` in `flags` for as long as the SQE stays
/// armed. Once the kernel drops the multishot (listener closed, EAGAIN-like
/// errors) `F_MORE` is clear and userland must re-submit. B4 (2026-06-20):
/// cuts the one-SQE-per-accept overhead under high-conn-churn workloads.
pub const IORING_ACCEPT_MULTISHOT: u16 = 1 << 0;
137
138// ---- io_uring_register opcodes --------------------------------------------
139
/// Register a files table from an array of fds. Defined for completeness;
/// NOTE(review): presumably unused because the crate registers through
/// `IORING_REGISTER_FILES2` instead — confirm against the callers.
#[allow(dead_code)]
pub const IORING_REGISTER_FILES: c_int = 2;
/// Drop the registered files table. Defined for completeness — the
/// registered files table is auto-released when the ring fd closes, so
/// explicit unregister is unused.
#[allow(dead_code)]
pub const IORING_UNREGISTER_FILES: c_int = 3;
/// **Linux 5.5+** (per `io_uring_register(2)`). Replace one slot's fd in a
/// previously-registered files table. Caller passes a
/// `struct io_uring_files_update` describing the slot index + fd; -1 in the
/// fd field unmaps the slot.
pub const IORING_REGISTER_FILES_UPDATE: c_int = 6;
/// **Linux 5.13+**. Register a files table via the rsrc-struct API. Pair
/// with `IORING_RSRC_REGISTER_SPARSE` in the struct's flags to allocate
/// an empty table of `nr` slots without supplying initial fds.
///
/// NOTE(review): the man page lists `IORING_RSRC_REGISTER_SPARSE` itself as
/// 5.19+ — confirm the crate's stated minimum kernel accounts for that.
pub const IORING_REGISTER_FILES2: c_int = 13;
154
/// **Linux 5.19+**. Register a provided-buffer ring (see the
/// `IO_URING_BUF_*` layout constants) for a buffer group.
pub const IORING_REGISTER_PBUF_RING: c_int = 22;
/// **Linux 5.19+**. Unregister a provided-buffer ring previously set up
/// with `IORING_REGISTER_PBUF_RING`.
pub const IORING_UNREGISTER_PBUF_RING: c_int = 23;
157
/// **Linux 5.18+**. Register the ring's own fd into the user task's
/// io_uring-registered-rings table. After registration, callers pass the
/// returned index (with `IORING_ENTER_REGISTERED_RING` set) instead of
/// the raw ring fd; the kernel skips `fget`/`fput` per `io_uring_enter`
/// syscall — the largest visible kernel-side cost in kevy's perf-record
/// (5.5% / 2.7% of -c1 CPU before this attack).
pub const IORING_REGISTER_RING_FDS: c_int = 20;
/// Inverse of [`IORING_REGISTER_RING_FDS`]: remove a ring fd from the
/// registered-rings table. Kept in the ABI table for completeness;
/// NOTE(review): presumably never issued by the crate, hence the `allow`.
#[allow(dead_code)]
pub const IORING_UNREGISTER_RING_FDS: c_int = 21;

/// **Linux 5.18+**. Tells `io_uring_enter` that its `fd` argument is an
/// index into the registered-rings table from
/// [`IORING_REGISTER_RING_FDS`], not a raw fd.
pub const IORING_ENTER_REGISTERED_RING: u32 = 1 << 4;
172
173// ---- SQE flags for fixed-file ops ----------------------------------------
174
/// **Linux 5.1+**. Treat the SQE's `fd` field as an **index into the
/// registered files table** (see [`IORING_REGISTER_FILES2`], used with the
/// `IORING_RSRC_REGISTER_SPARSE` flag) instead of a real fd. The kernel
/// skips the per-op `fget`/`fput` fd-table lookup — the largest single
/// non-Spectre kernel cost in kevy's hot path (8 pp of -c1 CPU on the lx64
/// reference; see attack E1).
pub const IOSQE_FIXED_FILE: u8 = 1 << 0;
181
182// ---- Completion `flags` bits ----------------------------------------------
183// A buffer was used (id in the top 16 bits) / the multishot SQE remains armed.
184
/// A provided buffer was used for this completion; its id is carried in the
/// top 16 bits of the CQE `flags` word.
pub const IORING_CQE_F_BUFFER: u32 = 1 << 0;
/// The multishot SQE that produced this CQE remains armed.
pub const IORING_CQE_F_MORE: u32 = 1 << 1;
/// Right-shift that moves the buffer id (top 16 bits of `flags`) down to
/// bit 0.
pub const IORING_CQE_BUFFER_SHIFT: u32 = 16;
188
189// ---- Provided-buffer ring layout constants --------------------------------
190
/// `sizeof(struct io_uring_buf)`, spelled out field by field:
/// `{ addr: u64, len: u32, bid: u16, resv: u16 }` = 8 + 4 + 2 + 2 = 16 bytes.
pub const IO_URING_BUF_SIZE: usize = core::mem::size_of::<u64>()
    + core::mem::size_of::<u32>()
    + 2 * core::mem::size_of::<u16>();
/// Byte offset of the producer `tail` within the buf ring. The tail aliases
/// `bufs[0].resv` — the trailing `u16` of the first entry, i.e. offset 14 —
/// so adding a buffer at index 0, which writes only addr/len/bid
/// (offsets 0..14), never clobbers it.
pub const IO_URING_BUF_TAIL_OFF: usize = IO_URING_BUF_SIZE - core::mem::size_of::<u16>();