io_uring/submit.rs
1use std::os::unix::io::{AsRawFd, RawFd};
2use std::sync::atomic;
3use std::{io, mem, ptr};
4
5use crate::register::{execute, Probe};
6use crate::sys;
7use crate::types::{CancelBuilder, CloneBuffersFlags, Napi, Timespec};
8use crate::util::{cast_ptr, OwnedFd};
9use crate::Parameters;
10use bitflags::bitflags;
11
12use crate::register::Restriction;
13
14use crate::types;
15
16bitflags!(
17 /// See man page for complete description:
18 /// https://man7.org/linux/man-pages/man2/io_uring_enter.2.html
19 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
20 pub struct EnterFlags: u32 {
21 /// Wait for at least `min_complete` events to complete.
22 const GETEVENTS = sys::IORING_ENTER_GETEVENTS;
23
24 /// If the kernel thread is sleeping, wake it up.
25 const SQ_WAKEUP = sys::IORING_ENTER_SQ_WAKEUP;
26
27 /// Wait for at least one submission queue entry to be available.
28 const SQ_WAIT = sys::IORING_ENTER_SQ_WAIT;
29
30 /// Use the extended argument structure.
31 const EXT_ARG = sys::IORING_ENTER_EXT_ARG;
32
33 /// Submit using registered submission queue ring.
34 const REGISTERED_RING = sys::IORING_ENTER_REGISTERED_RING;
35
36 /// Timeout argument interpreted as absolute time.
37 const ABS_TIMER = sys::IORING_ENTER_ABS_TIMER;
38
39 /// Arg is offset into an area of wait regions previously registered.
40 const EXT_ARG_REG = sys::IORING_ENTER_EXT_ARG_REG;
41
42 /// Don't mark waiting task as being in iowait in certain cases.
43 const NO_IOWAIT = sys::IORING_ENTER_NO_IOWAIT;
44 }
45);
46
/// Interface for submitting submission queue events in an io_uring instance to the kernel for
/// executing and registering files or buffers with the instance.
///
/// io_uring supports both directly performing I/O on buffers and file descriptors and registering
/// them beforehand. Registering is slow, but it makes performing the actual I/O much faster.
pub struct Submitter<'a> {
    /// Borrowed descriptor of the ring; its raw fd is handed to every enter/register call.
    fd: &'a OwnedFd,
    /// Ring parameters, consulted for setup flags (SQPOLL, IOPOLL) and features (NODROP).
    params: &'a Parameters,

    /// Submission queue head index; read together with `sq_tail` to count queued entries.
    sq_head: *const atomic::AtomicU32,
    /// Submission queue tail index.
    sq_tail: *const atomic::AtomicU32,
    /// Submission queue flags word; tested for `IORING_SQ_NEED_WAKEUP` and
    /// `IORING_SQ_CQ_OVERFLOW`.
    sq_flags: *const atomic::AtomicU32,
}
60
61impl<'a> Submitter<'a> {
62 #[inline]
63 pub(crate) const fn new(
64 fd: &'a OwnedFd,
65 params: &'a Parameters,
66 sq_head: *const atomic::AtomicU32,
67 sq_tail: *const atomic::AtomicU32,
68 sq_flags: *const atomic::AtomicU32,
69 ) -> Submitter<'a> {
70 Submitter {
71 fd,
72 params,
73 sq_head,
74 sq_tail,
75 sq_flags,
76 }
77 }
78
79 #[inline]
80 fn sq_len(&self) -> usize {
81 unsafe {
82 let head = (*self.sq_head).load(atomic::Ordering::Acquire);
83 let tail = (*self.sq_tail).load(atomic::Ordering::Acquire);
84
85 tail.wrapping_sub(head) as usize
86 }
87 }
88
89 /// Whether the kernel thread has gone to sleep because it waited for too long without
90 /// submission queue entries.
91 #[inline]
92 fn sq_need_wakeup(&self) -> bool {
93 unsafe {
94 (*self.sq_flags).load(atomic::Ordering::Relaxed) & sys::IORING_SQ_NEED_WAKEUP != 0
95 }
96 }
97
98 /// CQ ring is overflown
99 fn sq_cq_overflow(&self) -> bool {
100 unsafe {
101 (*self.sq_flags).load(atomic::Ordering::Relaxed) & sys::IORING_SQ_CQ_OVERFLOW != 0
102 }
103 }
104
105 /// Initiate and/or complete asynchronous I/O. This is a low-level wrapper around
106 /// `io_uring_enter` - see `man io_uring_enter` (or [its online
107 /// version](https://manpages.debian.org/unstable/liburing-dev/io_uring_enter.2.en.html) for
108 /// more details.
109 ///
110 /// You will probably want to use a more high-level API such as
111 /// [`submit`](Self::submit) or [`submit_and_wait`](Self::submit_and_wait).
112 ///
113 /// # Safety
114 ///
115 /// This provides a raw interface so developer must ensure that parameters are correct.
116 pub unsafe fn enter<T: Sized>(
117 &self,
118 to_submit: u32,
119 min_complete: u32,
120 flag: u32,
121 arg: Option<&T>,
122 ) -> io::Result<usize> {
123 let arg = arg
124 .map(|arg| cast_ptr(arg).cast())
125 .unwrap_or_else(ptr::null);
126 let size = mem::size_of::<T>();
127 sys::io_uring_enter(
128 self.fd.as_raw_fd(),
129 to_submit,
130 min_complete,
131 flag,
132 arg,
133 size,
134 )
135 .map(|res| res as _)
136 }
137
138 /// Submit all queued submission queue events to the kernel.
139 #[inline]
140 pub fn submit(&self) -> io::Result<usize> {
141 self.submit_and_wait(0)
142 }
143
144 /// Submit all queued submission queue events to the kernel and wait for at least `want`
145 /// completion events to complete.
146 pub fn submit_and_wait(&self, want: usize) -> io::Result<usize> {
147 let len = self.sq_len();
148 let mut flags = EnterFlags::empty();
149
150 // This logic suffers from the fact the sq_cq_overflow and sq_need_wakeup
151 // each cause an atomic load of the same variable, self.sq_flags.
152 // In the hottest paths, when a server is running with sqpoll,
153 // this is going to be hit twice, when once would be sufficient.
154 // However, consider that the `SeqCst` barrier required for interpreting
155 // the IORING_ENTER_SQ_WAKEUP bit is required in all paths where sqpoll
156 // is setup when consolidating the reads.
157
158 let sq_cq_overflow = self.sq_cq_overflow();
159
160 // When IORING_FEAT_NODROP is enabled and CQ overflows, the kernel buffers
161 // completion events internally but doesn't automatically flush them when
162 // CQ space becomes available. We must explicitly call io_uring_enter()
163 // to flush these buffered events, even with SQPOLL enabled.
164 //
165 // Without this, completions remain stuck in kernel's internal buffer
166 // after draining CQ, causing missing completion notifications.
167 let need_syscall_for_overflow = sq_cq_overflow && self.params.is_feature_nodrop();
168
169 if want > 0 || self.params.is_setup_iopoll() || sq_cq_overflow {
170 flags.insert(EnterFlags::GETEVENTS);
171 }
172
173 if self.params.is_setup_sqpoll() {
174 // See discussion in [`SubmissionQueue::need_wakeup`].
175 atomic::fence(atomic::Ordering::SeqCst);
176 if self.sq_need_wakeup() {
177 flags.insert(EnterFlags::SQ_WAKEUP);
178 } else if want == 0 && !need_syscall_for_overflow {
179 // The kernel thread is polling and hasn't fallen asleep, so we don't need to tell
180 // it to process events or wake it up
181
182 // However, if the CQ ring is overflown, we need to tell the kernel to process events
183 // by calling io_uring_enter with the IORING_ENTER_GETEVENTS flag.
184 return Ok(len);
185 }
186 }
187
188 unsafe { self.enter::<libc::sigset_t>(len as _, want as _, flags.bits(), None) }
189 }
190
191 /// Submit all queued submission queue events to the kernel and wait for at least `want`
192 /// completion events to complete with additional options
193 ///
194 /// You can specify a set of signals to mask and a timeout for operation, see
195 /// [`SubmitArgs`](types::SubmitArgs) for more details
196 pub fn submit_with_args(
197 &self,
198 want: usize,
199 args: &types::SubmitArgs<'_, '_>,
200 ) -> io::Result<usize> {
201 let len = self.sq_len();
202 let mut flags = EnterFlags::EXT_ARG;
203
204 let sq_cq_overflow = self.sq_cq_overflow();
205 let need_syscall = sq_cq_overflow & self.params.is_feature_nodrop();
206
207 if want > 0 || self.params.is_setup_iopoll() || sq_cq_overflow {
208 flags.insert(EnterFlags::GETEVENTS);
209 }
210
211 if self.params.is_setup_sqpoll() {
212 // See discussion in [`SubmissionQueue::need_wakeup`].
213 atomic::fence(atomic::Ordering::SeqCst);
214 if self.sq_need_wakeup() {
215 flags.insert(EnterFlags::SQ_WAKEUP);
216 } else if want == 0 && !need_syscall {
217 // The kernel thread is polling and hasn't fallen asleep, so we don't need to tell
218 // it to process events or wake it up
219 return Ok(len);
220 }
221 }
222
223 unsafe { self.enter(len as _, want as _, flags.bits(), Some(args)) }
224 }
225
226 /// Wait for the submission queue to have free entries.
227 pub fn squeue_wait(&self) -> io::Result<usize> {
228 unsafe { self.enter::<libc::sigset_t>(0, 0, EnterFlags::SQ_WAIT.bits(), None) }
229 }
230
231 /// Register in-memory fixed buffers for I/O with the kernel. You can use these buffers with the
232 /// [`ReadFixed`](crate::opcode::ReadFixed) and [`WriteFixed`](crate::opcode::WriteFixed)
233 /// operations.
234 ///
235 /// # Safety
236 ///
237 /// Developers must ensure that the `iov_base` and `iov_len` values are valid and will
238 /// be valid until buffers are unregistered or the ring destroyed, otherwise undefined
239 /// behaviour may occur.
240 pub unsafe fn register_buffers(&self, bufs: &[libc::iovec]) -> io::Result<()> {
241 execute(
242 self.fd.as_raw_fd(),
243 sys::IORING_REGISTER_BUFFERS,
244 bufs.as_ptr().cast(),
245 bufs.len() as _,
246 )
247 .map(drop)
248 }
249
250 /// Update a range of fixed buffers starting at `offset`.
251 ///
252 /// This is required to use buffers registered using
253 /// [`register_buffers_sparse`](Self::register_buffers_sparse),
254 /// although it can be also be used with [`register_buffers`](Self::register_buffers).
255 ///
256 /// See [`register_buffers2`](Self::register_buffers2)
257 /// for more information about resource tagging.
258 ///
259 /// Available since Linux 5.13.
260 ///
261 /// # Safety
262 ///
263 /// Developers must ensure that the `iov_base` and `iov_len` values are valid and will
264 /// be valid until buffers are unregistered or the ring destroyed, otherwise undefined
265 /// behaviour may occur.
266 pub unsafe fn register_buffers_update(
267 &self,
268 offset: u32,
269 bufs: &[libc::iovec],
270 tags: Option<&[u64]>,
271 ) -> io::Result<()> {
272 let nr = tags
273 .as_ref()
274 .map_or(bufs.len(), |tags| bufs.len().min(tags.len()));
275
276 let rr = sys::io_uring_rsrc_update2 {
277 nr: nr as _,
278 data: bufs.as_ptr() as _,
279 tags: tags.map(|tags| tags.as_ptr() as _).unwrap_or(0),
280 offset,
281 ..Default::default()
282 };
283
284 execute(
285 self.fd.as_raw_fd(),
286 sys::IORING_REGISTER_BUFFERS_UPDATE,
287 cast_ptr::<sys::io_uring_rsrc_update2>(&rr).cast(),
288 std::mem::size_of::<sys::io_uring_rsrc_update2>() as _,
289 )
290 .map(drop)
291 }
292
293 /// Variant of [`register_buffers`](Self::register_buffers)
294 /// with resource tagging.
295 ///
296 /// `tags` should be the same length as `bufs` and contain the
297 /// tag value corresponding to the buffer at the same index.
298 ///
299 /// If a tag is zero, then tagging for this particular resource
300 /// (a buffer in this case) is disabled. Otherwise, after the
301 /// resource had been unregistered and it's not used anymore,
302 /// a CQE will be posted with `user_data` set to the specified
303 /// tag and all other fields zeroed.
304 ///
305 /// Available since Linux 5.13.
306 ///
307 /// # Safety
308 ///
309 /// Developers must ensure that the `iov_base` and `iov_len` values are valid and will
310 /// be valid until buffers are unregistered or the ring destroyed, otherwise undefined
311 /// behaviour may occur.
312 pub unsafe fn register_buffers2(&self, bufs: &[libc::iovec], tags: &[u64]) -> io::Result<()> {
313 let rr = sys::io_uring_rsrc_register {
314 nr: bufs.len().min(tags.len()) as _,
315 data: bufs.as_ptr() as _,
316 tags: tags.as_ptr() as _,
317 ..Default::default()
318 };
319 execute(
320 self.fd.as_raw_fd(),
321 sys::IORING_REGISTER_BUFFERS2,
322 cast_ptr::<sys::io_uring_rsrc_register>(&rr).cast(),
323 std::mem::size_of::<sys::io_uring_rsrc_register>() as _,
324 )
325 .map(drop)
326 }
327
328 /// Registers an empty table of nr fixed buffers buffers.
329 ///
330 /// These must be updated before use, using eg.
331 /// [`register_buffers_update`](Self::register_buffers_update).
332 ///
333 /// See [`register_buffers`](Self::register_buffers)
334 /// for more information about fixed buffers.
335 ///
336 /// Available since Linux 5.13.
337 pub fn register_buffers_sparse(&self, nr: u32) -> io::Result<()> {
338 let rr = sys::io_uring_rsrc_register {
339 nr,
340 flags: sys::IORING_RSRC_REGISTER_SPARSE,
341 ..Default::default()
342 };
343 execute(
344 self.fd.as_raw_fd(),
345 sys::IORING_REGISTER_BUFFERS2,
346 cast_ptr::<sys::io_uring_rsrc_register>(&rr).cast(),
347 std::mem::size_of::<sys::io_uring_rsrc_register>() as _,
348 )
349 .map(drop)
350 }
351
352 /// Clone the entire registered buffer table from another ring into this one.
353 ///
354 /// `src_fd` is the raw file descriptor of the source `io_uring`. The source's
355 /// buffers are shared with this ring rather than copied, so a single physical
356 /// registration can back many rings without re-pinning the pages in the kernel.
357 ///
358 /// This ring's buffer table must be empty. To clone into a non-empty table or
359 /// to copy a sub-range, use
360 /// [`register_buffers_clone_offset`](Self::register_buffers_clone_offset).
361 ///
362 /// Available since Linux 6.12.
363 pub fn register_buffers_clone(&self, src_fd: RawFd) -> io::Result<()> {
364 self.register_buffers_clone_offset(src_fd, 0, 0, 0, CloneBuffersFlags::empty())
365 }
366
367 /// Clone a range of the registered buffer table from another ring into this one.
368 ///
369 /// `src_fd` is the raw file descriptor of the source `io_uring`. `nr` buffers
370 /// starting at `src_off` in the source table are installed starting at `dst_off`
371 /// in this ring's table. A `nr` of `0` clones the source's entire table.
372 ///
373 /// See [`CloneBuffersFlags`] for replacing an existing destination range or
374 /// treating `src_fd` as a registered ring descriptor.
375 ///
376 /// Available since Linux 6.12.
377 pub fn register_buffers_clone_offset(
378 &self,
379 src_fd: RawFd,
380 src_off: u32,
381 dst_off: u32,
382 nr: u32,
383 flags: CloneBuffersFlags,
384 ) -> io::Result<()> {
385 let arg = sys::io_uring_clone_buffers {
386 src_fd: src_fd as _,
387 flags: flags.bits(),
388 src_off,
389 dst_off,
390 nr,
391 ..Default::default()
392 };
393 execute(
394 self.fd.as_raw_fd(),
395 sys::IORING_REGISTER_CLONE_BUFFERS,
396 cast_ptr::<sys::io_uring_clone_buffers>(&arg).cast(),
397 // This opcode takes a single struct; the kernel requires nr_args == 1.
398 1,
399 )
400 .map(drop)
401 }
402
403 /// Registers an empty file table of nr_files number of file descriptors. The sparse variant is
404 /// available in kernels 5.19 and later.
405 ///
406 /// Registering a file table is a prerequisite for using any request that
407 /// uses direct descriptors.
408 pub fn register_files_sparse(&self, nr: u32) -> io::Result<()> {
409 let rr = sys::io_uring_rsrc_register {
410 nr,
411 flags: sys::IORING_RSRC_REGISTER_SPARSE,
412 resv2: 0,
413 data: 0,
414 tags: 0,
415 };
416 execute(
417 self.fd.as_raw_fd(),
418 sys::IORING_REGISTER_FILES2,
419 cast_ptr::<sys::io_uring_rsrc_register>(&rr).cast(),
420 mem::size_of::<sys::io_uring_rsrc_register>() as _,
421 )
422 .map(drop)
423 }
424
425 /// Register files for I/O. You can use the registered files with
426 /// [`Fixed`](crate::types::Fixed).
427 ///
428 /// Each fd may be -1, in which case it is considered "sparse", and can be filled in later with
429 /// [`register_files_update`](Self::register_files_update).
430 ///
431 /// Note that this will wait for the ring to idle; it will only return once all active requests
432 /// are complete. Use [`register_files_update`](Self::register_files_update) to avoid this.
433 pub fn register_files(&self, fds: &[RawFd]) -> io::Result<()> {
434 execute(
435 self.fd.as_raw_fd(),
436 sys::IORING_REGISTER_FILES,
437 fds.as_ptr().cast(),
438 fds.len() as _,
439 )
440 .map(drop)
441 }
442
443 /// This operation replaces existing files in the registered file set with new ones,
444 /// either turning a sparse entry (one where fd is equal to -1) into a real one, removing an existing entry (new one is set to -1),
445 /// or replacing an existing entry with a new existing entry. The `offset` parameter specifies
446 /// the offset into the list of registered files at which to start updating files.
447 ///
448 /// You can also perform this asynchronously with the
449 /// [`FilesUpdate`](crate::opcode::FilesUpdate) opcode.
450 pub fn register_files_update(&self, offset: u32, fds: &[RawFd]) -> io::Result<usize> {
451 let fu = sys::io_uring_files_update {
452 offset,
453 resv: 0,
454 fds: fds.as_ptr() as _,
455 };
456 let ret = execute(
457 self.fd.as_raw_fd(),
458 sys::IORING_REGISTER_FILES_UPDATE,
459 cast_ptr::<sys::io_uring_files_update>(&fu).cast(),
460 fds.len() as _,
461 )?;
462 Ok(ret as _)
463 }
464
465 /// Register an eventfd created by [`eventfd`](libc::eventfd) with the io_uring instance.
466 pub fn register_eventfd(&self, eventfd: RawFd) -> io::Result<()> {
467 execute(
468 self.fd.as_raw_fd(),
469 sys::IORING_REGISTER_EVENTFD,
470 cast_ptr::<RawFd>(&eventfd).cast(),
471 1,
472 )
473 .map(drop)
474 }
475
476 /// This works just like [`register_eventfd`](Self::register_eventfd), except notifications are
477 /// only posted for events that complete in an async manner, so requests that complete
478 /// immediately will not cause a notification.
479 pub fn register_eventfd_async(&self, eventfd: RawFd) -> io::Result<()> {
480 execute(
481 self.fd.as_raw_fd(),
482 sys::IORING_REGISTER_EVENTFD_ASYNC,
483 cast_ptr::<RawFd>(&eventfd).cast(),
484 1,
485 )
486 .map(drop)
487 }
488
489 /// Fill in the given [`Probe`] with information about the opcodes supported by io_uring on the
490 /// running kernel.
491 ///
492 /// # Examples
493 ///
494 // This is marked no_run as it is only available from Linux 5.6+, however the latest Ubuntu (on
495 // which CI runs) only has Linux 5.4.
496 /// ```no_run
497 /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
498 /// let io_uring = io_uring::IoUring::new(1)?;
499 /// let mut probe = io_uring::Probe::new();
500 /// io_uring.submitter().register_probe(&mut probe)?;
501 ///
502 /// if probe.is_supported(io_uring::opcode::Read::CODE) {
503 /// println!("Reading is supported!");
504 /// }
505 /// # Ok(())
506 /// # }
507 /// ```
508 pub fn register_probe(&self, probe: &mut Probe) -> io::Result<()> {
509 execute(
510 self.fd.as_raw_fd(),
511 sys::IORING_REGISTER_PROBE,
512 probe.as_mut_ptr() as *const _,
513 Probe::COUNT as _,
514 )
515 .map(drop)
516 }
517
518 /// Register credentials of the running application with io_uring, and get an id associated with
519 /// these credentials. This ID can then be [passed](crate::squeue::Entry::personality) into
520 /// submission queue entries to issue the request with this process' credentials.
521 ///
522 /// By default, if [`Parameters::is_feature_cur_personality`] is set then requests will use the
523 /// credentials of the task that called [`Submitter::enter`], otherwise they will use the
524 /// credentials of the task that originally registered the io_uring.
525 ///
526 /// [`Parameters::is_feature_cur_personality`]: crate::Parameters::is_feature_cur_personality
527 pub fn register_personality(&self) -> io::Result<u16> {
528 let id = execute(
529 self.fd.as_raw_fd(),
530 sys::IORING_REGISTER_PERSONALITY,
531 ptr::null(),
532 0,
533 )?;
534 Ok(id as u16)
535 }
536
537 /// Unregister all previously registered buffers.
538 ///
539 /// You do not need to explicitly call this before dropping the [`IoUring`](crate::IoUring), as
540 /// it will be cleaned up by the kernel automatically.
541 ///
542 /// Available since Linux 5.1.
543 pub fn unregister_buffers(&self) -> io::Result<()> {
544 execute(
545 self.fd.as_raw_fd(),
546 sys::IORING_UNREGISTER_BUFFERS,
547 ptr::null(),
548 0,
549 )
550 .map(drop)
551 }
552
553 /// Unregister all previously registered files.
554 ///
555 /// You do not need to explicitly call this before dropping the [`IoUring`](crate::IoUring), as
556 /// it will be cleaned up by the kernel automatically.
557 pub fn unregister_files(&self) -> io::Result<()> {
558 execute(
559 self.fd.as_raw_fd(),
560 sys::IORING_UNREGISTER_FILES,
561 ptr::null(),
562 0,
563 )
564 .map(drop)
565 }
566
567 /// Unregister an eventfd file descriptor to stop notifications.
568 pub fn unregister_eventfd(&self) -> io::Result<()> {
569 execute(
570 self.fd.as_raw_fd(),
571 sys::IORING_UNREGISTER_EVENTFD,
572 ptr::null(),
573 0,
574 )
575 .map(drop)
576 }
577
578 /// Unregister a previously registered personality.
579 pub fn unregister_personality(&self, personality: u16) -> io::Result<()> {
580 execute(
581 self.fd.as_raw_fd(),
582 sys::IORING_UNREGISTER_PERSONALITY,
583 ptr::null(),
584 personality as _,
585 )
586 .map(drop)
587 }
588
589 /// Permanently install a feature allowlist. Once this has been called, attempting to perform
590 /// an operation not on the allowlist will fail with `-EACCES`.
591 ///
592 /// This can only be called once, to prevent untrusted code from removing restrictions.
593 pub fn register_restrictions(&self, res: &mut [Restriction]) -> io::Result<()> {
594 execute(
595 self.fd.as_raw_fd(),
596 sys::IORING_REGISTER_RESTRICTIONS,
597 res.as_mut_ptr().cast(),
598 res.len() as _,
599 )
600 .map(drop)
601 }
602
603 /// Enable the rings of the io_uring instance if they have been disabled with
604 /// [`setup_r_disabled`](crate::Builder::setup_r_disabled).
605 pub fn register_enable_rings(&self) -> io::Result<()> {
606 execute(
607 self.fd.as_raw_fd(),
608 sys::IORING_REGISTER_ENABLE_RINGS,
609 ptr::null(),
610 0,
611 )
612 .map(drop)
613 }
614
615 /// Tell io_uring on what CPUs the async workers can run. By default, async workers
616 /// created by io_uring will inherit the CPU mask of its parent. This is usually
617 /// all the CPUs in the system, unless the parent is being run with a limited set.
618 pub fn register_iowq_aff(&self, cpu_set: &libc::cpu_set_t) -> io::Result<()> {
619 execute(
620 self.fd.as_raw_fd(),
621 sys::IORING_REGISTER_IOWQ_AFF,
622 cpu_set as *const _ as *const libc::c_void,
623 mem::size_of::<libc::cpu_set_t>() as u32,
624 )
625 .map(drop)
626 }
627
628 /// Undoes a CPU mask previously set with register_iowq_aff
629 pub fn unregister_iowq_aff(&self) -> io::Result<()> {
630 execute(
631 self.fd.as_raw_fd(),
632 sys::IORING_UNREGISTER_IOWQ_AFF,
633 ptr::null(),
634 0,
635 )
636 .map(drop)
637 }
638
639 /// Get and/or set the limit for number of io_uring worker threads per NUMA
640 /// node. `max[0]` holds the limit for bounded workers, which process I/O
641 /// operations expected to be bound in time, that is I/O on regular files or
642 /// block devices. While `max[1]` holds the limit for unbounded workers,
643 /// which carry out I/O operations that can never complete, for instance I/O
644 /// on sockets. Passing `0` does not change the current limit. Returns
645 /// previous limits on success.
646 pub fn register_iowq_max_workers(&self, max: &mut [u32; 2]) -> io::Result<()> {
647 execute(
648 self.fd.as_raw_fd(),
649 sys::IORING_REGISTER_IOWQ_MAX_WORKERS,
650 max.as_mut_ptr().cast(),
651 max.len() as _,
652 )
653 .map(drop)
654 }
655
656 /// Register NAPI busy-poll settings on this ring.
657 ///
658 /// The kernel writes the previous settings back into `napi` before applying the new
659 /// ones; read them back with [`Napi::busy_poll_timeout`] and
660 /// [`Napi::prefer_busy_poll`].
661 ///
662 /// Available since Linux 6.9.
663 pub fn register_napi(&self, napi: &mut Napi) -> io::Result<()> {
664 execute(
665 self.fd.as_raw_fd(),
666 sys::IORING_REGISTER_NAPI,
667 napi.as_mut_ptr().cast(),
668 1,
669 )
670 .map(drop)
671 }
672
673 /// Unregister NAPI busy-poll from this ring.
674 ///
675 /// The kernel writes the current settings back into `napi` before disabling them;
676 /// read them back with [`Napi::busy_poll_timeout`] and [`Napi::prefer_busy_poll`]. A
677 /// valid buffer is required, as the kernel rejects a null argument with `EINVAL`.
678 ///
679 /// Available since Linux 6.9.
680 pub fn unregister_napi(&self, napi: &mut Napi) -> io::Result<()> {
681 execute(
682 self.fd.as_raw_fd(),
683 sys::IORING_UNREGISTER_NAPI,
684 napi.as_mut_ptr().cast(),
685 1,
686 )
687 .map(drop)
688 }
689
690 /// Add a NAPI id to this ring's statically tracked busy-poll set.
691 ///
692 /// The ring must already be registered with [`NapiTracking::Static`]; otherwise the
693 /// kernel returns an error. `napi_id` identifies a NIC receive-queue NAPI instance,
694 /// typically obtained from a socket via the `SO_INCOMING_NAPI_ID` socket option.
695 ///
696 /// [`NapiTracking::Static`]: crate::types::NapiTracking::Static
697 ///
698 /// Available since Linux 6.13.
699 pub fn register_napi_add_id(&self, napi_id: u32) -> io::Result<()> {
700 self.register_napi_static_op(sys::IO_URING_NAPI_STATIC_ADD_ID as _, napi_id)
701 }
702
703 /// Remove a NAPI id from this ring's statically tracked busy-poll set.
704 ///
705 /// The ring must already be registered with [`NapiTracking::Static`]; otherwise the
706 /// kernel returns an error. See [`register_napi_add_id`](Self::register_napi_add_id).
707 ///
708 /// [`NapiTracking::Static`]: crate::types::NapiTracking::Static
709 ///
710 /// Available since Linux 6.13.
711 pub fn register_napi_del_id(&self, napi_id: u32) -> io::Result<()> {
712 self.register_napi_static_op(sys::IO_URING_NAPI_STATIC_DEL_ID as _, napi_id)
713 }
714
715 fn register_napi_static_op(&self, opcode: u8, napi_id: u32) -> io::Result<()> {
716 // Both ops are issued through IORING_REGISTER_NAPI, distinguished by `opcode`,
717 // with the NAPI id carried in `op_param`. The kernel writes the current settings
718 // back into the struct, so pass a mutable pointer even though we discard them.
719 let mut arg = sys::io_uring_napi {
720 opcode,
721 op_param: napi_id,
722 ..Default::default()
723 };
724 execute(
725 self.fd.as_raw_fd(),
726 sys::IORING_REGISTER_NAPI,
727 (&mut arg as *mut sys::io_uring_napi).cast(),
728 1,
729 )
730 .map(drop)
731 }
732
733 /// Register buffer ring for provided buffers.
734 ///
735 /// Details can be found in the io_uring_register_buf_ring.3 man page.
736 ///
737 /// If the register command is not supported, or the ring_entries value exceeds
738 /// 32768, the InvalidInput error is returned.
739 ///
740 /// Available since 5.19.
741 ///
742 /// # Safety
743 ///
744 /// Developers must ensure that the `ring_addr` and its length represented by `ring_entries`
745 /// are valid and will be valid until the bgid is unregistered or the ring destroyed,
746 /// otherwise undefined behaviour may occur.
747 #[deprecated(note = "please use `register_buf_ring_with_flags` instead")]
748 pub unsafe fn register_buf_ring(
749 &self,
750 ring_addr: u64,
751 ring_entries: u16,
752 bgid: u16,
753 ) -> io::Result<()> {
754 self.register_buf_ring_with_flags(ring_addr, ring_entries, bgid, 0)
755 }
756
757 /// Register buffer ring for provided buffers.
758 ///
759 /// Details can be found in the io_uring_register_buf_ring.3 man page.
760 ///
761 /// If the register command is not supported, or the ring_entries value exceeds
762 /// 32768, the InvalidInput error is returned.
763 ///
764 /// Available since 5.19.
765 ///
766 /// # Safety
767 ///
768 /// Developers must ensure that the `ring_addr` and its length represented by `ring_entries`
769 /// are valid and will be valid until the bgid is unregistered or the ring destroyed,
770 /// otherwise undefined behaviour may occur.
771 pub unsafe fn register_buf_ring_with_flags(
772 &self,
773 ring_addr: u64,
774 ring_entries: u16,
775 bgid: u16,
776 flags: u16,
777 ) -> io::Result<()> {
778 // The interface type for ring_entries is u32 but the same interface only allows a u16 for
779 // the tail to be specified, so to try and avoid further confusion, we limit the
780 // ring_entries to u16 here too. The value is actually limited to 2^15 (32768) but we can
781 // let the kernel enforce that.
782 let arg = sys::io_uring_buf_reg {
783 ring_addr,
784 ring_entries: ring_entries as _,
785 bgid,
786 flags,
787 ..Default::default()
788 };
789 execute(
790 self.fd.as_raw_fd(),
791 sys::IORING_REGISTER_PBUF_RING,
792 cast_ptr::<sys::io_uring_buf_reg>(&arg).cast(),
793 1,
794 )
795 .map(drop)
796 }
797
798 /// Unregister a previously registered buffer ring.
799 ///
800 /// Available since 5.19.
801 pub fn unregister_buf_ring(&self, bgid: u16) -> io::Result<()> {
802 let arg = sys::io_uring_buf_reg {
803 ring_addr: 0,
804 ring_entries: 0,
805 bgid,
806 ..Default::default()
807 };
808 execute(
809 self.fd.as_raw_fd(),
810 sys::IORING_UNREGISTER_PBUF_RING,
811 cast_ptr::<sys::io_uring_buf_reg>(&arg).cast(),
812 1,
813 )
814 .map(drop)
815 }
816
817 /// Performs a synchronous cancellation request, similar to [AsyncCancel](crate::opcode::AsyncCancel),
818 /// except that it completes synchronously.
819 ///
820 /// Cancellation can target a specific request, or all requests matching some criteria. The
821 /// [`CancelBuilder`] builder supports describing the match criteria for cancellation.
822 ///
823 /// An optional `timeout` can be provided to specify how long to wait for matched requests to be
824 /// canceled. If no timeout is provided, the default is to wait indefinitely.
825 ///
826 /// ### Errors
827 ///
828 /// If no requests are matched, returns:
829 ///
830 /// [io::ErrorKind::NotFound]: `No such file or directory (os error 2)`
831 ///
832 /// If a timeout is supplied, and the timeout elapses prior to all requests being canceled, returns:
833 ///
834 /// [io::ErrorKind::Uncategorized]: `Timer expired (os error 62)`
835 ///
836 /// ### Notes
837 ///
838 /// Only requests which have been submitted to the ring will be considered for cancellation. Requests
839 /// which have been written to the SQ, but not submitted, will not be canceled.
840 ///
841 /// Available since 6.0.
842 pub fn register_sync_cancel(
843 &self,
844 timeout: Option<Timespec>,
845 builder: CancelBuilder,
846 ) -> io::Result<()> {
847 let timespec = timeout.map(|ts| ts.0).unwrap_or(sys::__kernel_timespec {
848 tv_sec: -1,
849 tv_nsec: -1,
850 });
851 let user_data = builder.user_data.unwrap_or(0);
852 let flags = builder.flags.bits();
853 let fd = builder.to_fd();
854
855 let arg = sys::io_uring_sync_cancel_reg {
856 addr: user_data,
857 fd,
858 flags,
859 timeout: timespec,
860 ..Default::default()
861 };
862
863 execute(
864 self.fd.as_raw_fd(),
865 sys::IORING_REGISTER_SYNC_CANCEL,
866 cast_ptr::<sys::io_uring_sync_cancel_reg>(&arg).cast(),
867 1,
868 )
869 .map(drop)
870 }
871
872 /// Register a netdev hw rx queue for zerocopy.
873 ///
874 /// Available since 6.15.
875 pub fn register_ifq(&self, reg: &sys::io_uring_zcrx_ifq_reg) -> io::Result<()> {
876 execute(
877 self.fd.as_raw_fd(),
878 sys::IORING_REGISTER_ZCRX_IFQ,
879 cast_ptr::<sys::io_uring_zcrx_ifq_reg>(reg) as _,
880 1,
881 )
882 .map(drop)
883 }
884}