kevy_uring/ffi.rs
1//! Raw extern "C" declarations for the syscalls the engine needs (`mmap`,
2//! `munmap`, `close`, `syscall`) plus the io_uring kernel ABI constants
3//! everything else in the crate references.
4
5use core::ffi::{c_int, c_long, c_void};
6
unsafe extern "C" {
    /// `close(2)`: release the file descriptor `fd`.
    pub fn close(fd: c_int) -> c_int;

    /// `munmap(2)`: drop the mapping of `len` bytes that starts at `addr`.
    pub fn munmap(addr: *mut c_void, len: usize) -> c_int;

    /// `mmap(2)`: map `len` bytes of `fd`, starting at file offset `off`.
    ///
    /// NOTE(review): `off` is declared as `i64`, which matches `off_t` on
    /// 64-bit Linux targets; confirm before building for a 32-bit target.
    pub fn mmap(
        addr: *mut c_void,
        len: usize,
        prot: c_int,
        flags: c_int,
        fd: c_int,
        off: i64,
    ) -> *mut c_void;

    /// Generic syscall trampoline (variadic in C). Needed because io_uring
    /// has no glibc wrapper.
    pub fn syscall(num: c_long, ...) -> c_long;
}
21
// ---- io_uring syscall numbers ----------------------------------------------
// The three calls occupy consecutive slots starting at 425.
// NOTE(review): this is the numbering of the unified syscall table (x86_64,
// aarch64, ...); alpha and MIPS are believed to use offset numbers — confirm
// before porting to those targets.

/// `io_uring_setup(2)`.
pub const SYS_IO_URING_SETUP: c_long = 425;
/// `io_uring_enter(2)`.
pub const SYS_IO_URING_ENTER: c_long = SYS_IO_URING_SETUP + 1;
/// `io_uring_register(2)`.
pub const SYS_IO_URING_REGISTER: c_long = SYS_IO_URING_SETUP + 2;
27
// ---- mmap protection / flags ----------------------------------------------
// Written as bit positions; the numeric values are unchanged.

/// `prot` bit: pages may be read.
pub const PROT_READ: c_int = 1 << 0;
/// `prot` bit: pages may be written.
pub const PROT_WRITE: c_int = 1 << 1;
/// `flags` bit: shared mapping.
pub const MAP_SHARED: c_int = 1 << 0;
/// `flags` bit: private mapping.
pub const MAP_PRIVATE: c_int = 1 << 1;
/// `flags` bit: anonymous mapping (value `0x20`).
pub const MAP_ANONYMOUS: c_int = 1 << 5;
/// `flags` bit: `MAP_POPULATE` (value `0x8000`).
pub const MAP_POPULATE: c_int = 1 << 15;
36
// ---- mmap region offsets ---------------------------------------------------
// Passed as the `off` argument of `mmap` to select which of the three ring
// regions gets mapped.

/// Selects the submission-queue ring.
pub const IORING_OFF_SQ_RING: i64 = 0;
/// Selects the completion-queue ring (`0x0800_0000`).
pub const IORING_OFF_CQ_RING: i64 = 1 << 27;
/// Selects the SQE array (`0x1000_0000`).
pub const IORING_OFF_SQES: i64 = 1 << 28;
42
// ---- io_uring_setup flags -------------------------------------------------

/// SQPOLL: the kernel runs a dedicated submission-poll thread. While that
/// thread is awake it picks SQEs up directly from the SQ, so in the steady
/// state `io_uring_enter` is no longer needed to submit work.
pub const IORING_SETUP_SQPOLL: u32 = 1 << 1;

/// Bind the SQPOLL kernel thread to the CPU named by `sq_thread_cpu`.
/// Only valid together with [`IORING_SETUP_SQPOLL`].
pub const IORING_SETUP_SQ_AFF: u32 = 1 << 2;

/// **Linux 5.19+**. Tells the kernel that SQEs are issued from a
/// "cooperative task" context, i.e. the submitting user thread also
/// processes the CQEs. The kernel can then skip a `task_work_add`/IPI on
/// the completion path. Costs nothing when the thread calling
/// `io_uring_enter` is the same one draining CQEs.
pub const IORING_SETUP_COOP_TASKRUN: u32 = 1 << 8;

/// **Linux 6.0+**. Promise that **exactly one thread** submits to this
/// ring, which lets the kernel drop locking on the submission path. Holds
/// for kevy's per-shard rings: each ring is owned by a single shard thread.
pub const IORING_SETUP_SINGLE_ISSUER: u32 = 1 << 12;

/// **Linux 6.1+**. Run completion task_work only from the user thread's
/// own `io_uring_enter` call rather than from an IPI, cutting the cost of
/// completion-side bookkeeping. Must be combined with
/// [`IORING_SETUP_SINGLE_ISSUER`].
///
/// **Defined but not used in kevy** — see the E2 attack notes in
/// `bench/PERF-ATTACK-LOG-2026-06-20.md`. Kept in the ABI table as
/// documentation and for future single-threaded reactor callers.
#[allow(dead_code)]
pub const IORING_SETUP_DEFER_TASKRUN: u32 = 1 << 13;
75
// ---- io_uring_enter flags -------------------------------------------------

/// `IORING_ENTER_GETEVENTS` bit for the `flags` argument of `io_uring_enter`.
pub const IORING_ENTER_GETEVENTS: u32 = 1 << 0;

/// Un-park the SQPOLL kernel thread. Userland has to test the
/// `IORING_SQ_NEED_WAKEUP` bit in the shared `sq_flags` word and, whenever
/// it is set, call `io_uring_enter` with this flag.
pub const IORING_ENTER_SQ_WAKEUP: u32 = 1 << 1;
84
// ---- shared SQ ring flag bits ---------------------------------------------

/// Set once the SQPOLL kernel thread has gone idle for more than
/// `sq_thread_idle` ms and parked itself. While this bit is set, userland
/// MUST issue `io_uring_enter` with `IORING_ENTER_SQ_WAKEUP` to restart it.
pub const IORING_SQ_NEED_WAKEUP: u32 = 0x1;
91
// ---- Operation opcodes (subset we use), in numeric order ------------------

/// `IORING_OP_NOP`.
pub const IORING_OP_NOP: u8 = 0;
/// `IORING_OP_WRITEV`.
pub const IORING_OP_WRITEV: u8 = 2;
/// `IORING_OP_TIMEOUT`.
pub const IORING_OP_TIMEOUT: u8 = 11;
/// `IORING_OP_ACCEPT`.
pub const IORING_OP_ACCEPT: u8 = 13;
/// `IORING_OP_ASYNC_CANCEL` — cancels an SQE that was armed earlier. The
/// target is named by putting its `user_data` into this SQE's `addr` field.
///
/// Two CQEs result:
/// - one for the cancel request: `res = 0` on success, `-ENOENT` when no
///   matching SQE exists, `-EALREADY` when the target has already started
///   executing;
/// - one `-ECANCELED` for the target SQE.
///
/// v1.29 B2-alt relies on this to cancel an in-flight multishot recv before
/// moving the conn to single-shot `prep_read` for big-arg ingest.
pub const IORING_OP_ASYNC_CANCEL: u8 = 14;
/// `IORING_OP_READ`.
pub const IORING_OP_READ: u8 = 22;
/// `IORING_OP_WRITE`.
pub const IORING_OP_WRITE: u8 = 23;
108
/// POSIX `struct iovec` for `IORING_OP_WRITEV`. Matches the kernel
/// layout (pointer + length). L1 (2026-06-21): the reactor's reply
/// path submits an `&[Iovec]` so that bulk-header, value-bytes and
/// trailing-CRLF fuse into ONE syscall — skipping the per-GET
/// memcpy of the value into the per-conn output Vec.
///
/// **Lifetime**: the kernel reads the iovec array AND each iovec's
/// `base` slice asynchronously. The caller must keep both alive until
/// the matching CQE fires (`uring_arm_conns` parks them in the conn's
/// pending-writes state and drops on completion).
#[repr(C)]
#[derive(Clone, Copy, Debug)]
pub struct Iovec {
    /// Pointer to bytes.
    pub iov_base: *const u8,
    /// Number of bytes at `iov_base`.
    pub iov_len: usize,
}

// Compile-time ABI guard: the struct must stay exactly "pointer + length"
// with no padding, otherwise an `&[Iovec]` handed to the kernel is misread.
const _: () = assert!(
    core::mem::size_of::<Iovec>()
        == core::mem::size_of::<*const u8>() + core::mem::size_of::<usize>()
);
/// `IORING_OP_RECV`.
pub const IORING_OP_RECV: u8 = 27;

// accept4 flags applied to the accepted socket. They travel in the SQE's
// accept_flags field, which aliases `rw_flags`. Spelled in octal here;
// the values are 0x800 and 0x8_0000.

/// `SOCK_NONBLOCK` for the accepted socket.
pub const SOCK_NONBLOCK: u32 = 0o4000;
/// `SOCK_CLOEXEC` for the accepted socket.
pub const SOCK_CLOEXEC: u32 = 0o2000000;
133
// ---- SQE flags / ioprio bits for buffer-select + multishot recv -----------

/// SQE flag: the SQE picks a buffer from a group.
pub const IOSQE_BUFFER_SELECT: u8 = 1 << 5;

/// `(ioprio)` bit for recv: re-fire one recv per arrival.
pub const IORING_RECV_MULTISHOT: u16 = 1 << 1;

/// **Linux 5.19+**. `(ioprio)` bit for `IORING_OP_ACCEPT`: re-fire one
/// accept per arriving connection. The accept SQE stays armed across
/// completions; every CQE carries the new fd in `res`, and
/// `IORING_CQE_F_MORE` is set in `flags` for as long as the SQE remains
/// armed. Once the kernel drops the multishot (listener closed,
/// EAGAIN-like errors), `F_MORE` is clear and userland must re-submit.
/// B4 (2026-06-20): cuts the one-SQE-per-accept overhead under
/// high-conn-churn workloads.
pub const IORING_ACCEPT_MULTISHOT: u16 = 1 << 0;
145
// ---- io_uring_register opcodes --------------------------------------------

/// Register a files table from an array of fds. Defined for completeness.
#[allow(dead_code)]
pub const IORING_REGISTER_FILES: c_int = 2;
/// Defined for completeness — the registered files table is auto-released
/// when the ring fd closes, so explicit unregister is unused.
#[allow(dead_code)]
pub const IORING_UNREGISTER_FILES: c_int = 3;
/// **Linux 5.5+**. Replace one slot's fd in a previously-registered files
/// table. Caller passes a `struct io_uring_files_update` describing the
/// slot index + fd; -1 in the fd field unmaps the slot.
pub const IORING_REGISTER_FILES_UPDATE: c_int = 6;
/// **Linux 5.13+**. Register a files table via the rsrc-struct API. Pair
/// with `IORING_RSRC_REGISTER_SPARSE` in the struct's flags to allocate
/// an empty table of `nr` slots without supplying initial fds.
/// NOTE(review): the sparse flag itself is believed to need Linux 5.19+ —
/// confirm against the minimum kernel kevy supports.
pub const IORING_REGISTER_FILES2: c_int = 13;

/// Register a provided-buffer ring.
pub const IORING_REGISTER_PBUF_RING: c_int = 22;
/// Unregister a provided-buffer ring.
pub const IORING_UNREGISTER_PBUF_RING: c_int = 23;

/// **Linux 5.18+**. Register the ring's own fd into the user task's
/// io_uring-registered-rings table. After registration, callers pass the
/// returned index (with [`IORING_ENTER_REGISTERED_RING`] set) instead of
/// the raw ring fd; the kernel skips `fget`/`fput` per `io_uring_enter`
/// syscall — the largest visible kernel-side cost in kevy's perf-record
/// (5.5% / 2.7% of -c1 CPU before this attack).
pub const IORING_REGISTER_RING_FDS: c_int = 20;
/// Counterpart of [`IORING_REGISTER_RING_FDS`]; defined for completeness.
#[allow(dead_code)]
pub const IORING_UNREGISTER_RING_FDS: c_int = 21;

/// **Linux 5.18+**. `io_uring_enter` flag: the `fd` argument is an index
/// into the registered-rings table from [`IORING_REGISTER_RING_FDS`],
/// not a raw fd.
pub const IORING_ENTER_REGISTERED_RING: u32 = 1 << 4;
180
// ---- SQE flags for fixed-file ops ----------------------------------------

/// **Linux 5.1+**. Treat the SQE's `fd` field as an **index into the
/// registered files table** (see [`IORING_REGISTER_FILES2`], used with
/// `IORING_RSRC_REGISTER_SPARSE`) instead of a real fd. The kernel skips
/// the per-op `fget`/`fput` fd-table lookup — the largest single
/// non-Spectre kernel cost in kevy's hot path (8 pp of -c1 CPU on the
/// lx64 reference; see attack E1).
pub const IOSQE_FIXED_FILE: u8 = 1 << 0;
189
// ---- Completion `flags` bits ----------------------------------------------

/// A buffer was used for this completion; its id sits in the top 16 bits
/// of `flags` (see [`IORING_CQE_BUFFER_SHIFT`]).
pub const IORING_CQE_F_BUFFER: u32 = 0x1;
/// The multishot SQE remains armed.
pub const IORING_CQE_F_MORE: u32 = 0x2;
/// Right-shift that moves the buffer id out of the top 16 bits of `flags`.
pub const IORING_CQE_BUFFER_SHIFT: u32 = 16;
196
// ---- Provided-buffer ring layout constants --------------------------------

/// `sizeof(struct io_uring_buf)`, spelled out field by field:
/// `{ addr:u64, len:u32, bid:u16, resv:u16 }`.
pub const IO_URING_BUF_SIZE: usize = 8 + 4 + 2 + 2;
/// Byte offset of the producer `tail` inside the buf ring. The tail aliases
/// `bufs[0].resv`, i.e. the last two bytes of the first entry. Adding a
/// buffer at index 0 writes only addr/len/bid (offsets 0..14), so it never
/// clobbers the tail.
pub const IO_URING_BUF_TAIL_OFF: usize = IO_URING_BUF_SIZE - 2;