Skip to main content

io_uring/
submit.rs

1use std::os::unix::io::{AsRawFd, RawFd};
2use std::sync::atomic;
3use std::{io, mem, ptr};
4
5use crate::register::{execute, Probe};
6use crate::sys;
7use crate::types::{CancelBuilder, CloneBuffersFlags, Napi, Timespec};
8use crate::util::{cast_ptr, OwnedFd};
9use crate::Parameters;
10use bitflags::bitflags;
11
12use crate::register::Restriction;
13
14use crate::types;
15
bitflags!(
    /// Flag bits that can be passed to the `io_uring_enter` system call.
    ///
    /// Every constant is a direct alias of the `sys::IORING_ENTER_*` value with the
    /// same suffix. See the man page for a complete description:
    /// https://man7.org/linux/man-pages/man2/io_uring_enter.2.html
    #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
    pub struct EnterFlags: u32 {
        /// Wait for at least `min_complete` events to complete.
        const GETEVENTS = sys::IORING_ENTER_GETEVENTS;

        /// If the kernel submission-queue polling thread is sleeping, wake it up.
        const SQ_WAKEUP = sys::IORING_ENTER_SQ_WAKEUP;

        /// Wait for at least one submission queue entry to be available.
        const SQ_WAIT = sys::IORING_ENTER_SQ_WAIT;

        /// Use the extended argument structure.
        const EXT_ARG = sys::IORING_ENTER_EXT_ARG;

        /// Submit using a registered submission queue ring.
        const REGISTERED_RING = sys::IORING_ENTER_REGISTERED_RING;

        /// The timeout argument is interpreted as an absolute time.
        const ABS_TIMER = sys::IORING_ENTER_ABS_TIMER;

        /// The argument is an offset into an area of previously registered wait regions.
        const EXT_ARG_REG = sys::IORING_ENTER_EXT_ARG_REG;

        /// Don't mark the waiting task as being in iowait in certain cases.
        const NO_IOWAIT = sys::IORING_ENTER_NO_IOWAIT;
    }
);
46
/// Interface for submitting submission queue events in an io_uring instance to the kernel for
/// executing and registering files or buffers with the instance.
///
/// io_uring supports both directly performing I/O on buffers and file descriptors and registering
/// them beforehand. Registering is slow, but it makes performing the actual I/O much faster.
///
/// A `Submitter` only borrows the ring's file descriptor and parameters for `'a`; it does not
/// own them.
pub struct Submitter<'a> {
    // File descriptor of the io_uring instance; every syscall in this type is issued on it.
    fd: &'a OwnedFd,
    // Ring parameters, consulted for the IOPOLL / SQPOLL setup flags and the NODROP feature.
    params: &'a Parameters,

    // Raw pointers to the submission queue's head index, tail index and flags word.
    // They are dereferenced in `sq_len`, `sq_need_wakeup` and `sq_cq_overflow`.
    // NOTE(review): presumably these point into the ring's shared mapping and stay valid
    // for `'a` - confirm against the code that constructs the `Submitter`.
    sq_head: *const atomic::AtomicU32,
    sq_tail: *const atomic::AtomicU32,
    sq_flags: *const atomic::AtomicU32,
}
60
61impl<'a> Submitter<'a> {
62    #[inline]
63    pub(crate) const fn new(
64        fd: &'a OwnedFd,
65        params: &'a Parameters,
66        sq_head: *const atomic::AtomicU32,
67        sq_tail: *const atomic::AtomicU32,
68        sq_flags: *const atomic::AtomicU32,
69    ) -> Submitter<'a> {
70        Submitter {
71            fd,
72            params,
73            sq_head,
74            sq_tail,
75            sq_flags,
76        }
77    }
78
79    #[inline]
80    fn sq_len(&self) -> usize {
81        unsafe {
82            let head = (*self.sq_head).load(atomic::Ordering::Acquire);
83            let tail = (*self.sq_tail).load(atomic::Ordering::Acquire);
84
85            tail.wrapping_sub(head) as usize
86        }
87    }
88
89    /// Whether the kernel thread has gone to sleep because it waited for too long without
90    /// submission queue entries.
91    #[inline]
92    fn sq_need_wakeup(&self) -> bool {
93        unsafe {
94            (*self.sq_flags).load(atomic::Ordering::Relaxed) & sys::IORING_SQ_NEED_WAKEUP != 0
95        }
96    }
97
98    /// CQ ring is overflown
99    fn sq_cq_overflow(&self) -> bool {
100        unsafe {
101            (*self.sq_flags).load(atomic::Ordering::Relaxed) & sys::IORING_SQ_CQ_OVERFLOW != 0
102        }
103    }
104
105    /// Initiate and/or complete asynchronous I/O. This is a low-level wrapper around
106    /// `io_uring_enter` - see `man io_uring_enter` (or [its online
107    /// version](https://manpages.debian.org/unstable/liburing-dev/io_uring_enter.2.en.html) for
108    /// more details.
109    ///
110    /// You will probably want to use a more high-level API such as
111    /// [`submit`](Self::submit) or [`submit_and_wait`](Self::submit_and_wait).
112    ///
113    /// # Safety
114    ///
115    /// This provides a raw interface so developer must ensure that parameters are correct.
116    pub unsafe fn enter<T: Sized>(
117        &self,
118        to_submit: u32,
119        min_complete: u32,
120        flag: u32,
121        arg: Option<&T>,
122    ) -> io::Result<usize> {
123        let arg = arg
124            .map(|arg| cast_ptr(arg).cast())
125            .unwrap_or_else(ptr::null);
126        let size = mem::size_of::<T>();
127        sys::io_uring_enter(
128            self.fd.as_raw_fd(),
129            to_submit,
130            min_complete,
131            flag,
132            arg,
133            size,
134        )
135        .map(|res| res as _)
136    }
137
138    /// Submit all queued submission queue events to the kernel.
139    #[inline]
140    pub fn submit(&self) -> io::Result<usize> {
141        self.submit_and_wait(0)
142    }
143
144    /// Submit all queued submission queue events to the kernel and wait for at least `want`
145    /// completion events to complete.
146    pub fn submit_and_wait(&self, want: usize) -> io::Result<usize> {
147        let len = self.sq_len();
148        let mut flags = EnterFlags::empty();
149
150        // This logic suffers from the fact the sq_cq_overflow and sq_need_wakeup
151        // each cause an atomic load of the same variable, self.sq_flags.
152        // In the hottest paths, when a server is running with sqpoll,
153        // this is going to be hit twice, when once would be sufficient.
154        // However, consider that the `SeqCst` barrier required for interpreting
155        // the IORING_ENTER_SQ_WAKEUP bit is required in all paths where sqpoll
156        // is setup when consolidating the reads.
157
158        let sq_cq_overflow = self.sq_cq_overflow();
159
160        // When IORING_FEAT_NODROP is enabled and CQ overflows, the kernel buffers
161        // completion events internally but doesn't automatically flush them when
162        // CQ space becomes available. We must explicitly call io_uring_enter()
163        // to flush these buffered events, even with SQPOLL enabled.
164        //
165        // Without this, completions remain stuck in kernel's internal buffer
166        // after draining CQ, causing missing completion notifications.
167        let need_syscall_for_overflow = sq_cq_overflow && self.params.is_feature_nodrop();
168
169        if want > 0 || self.params.is_setup_iopoll() || sq_cq_overflow {
170            flags.insert(EnterFlags::GETEVENTS);
171        }
172
173        if self.params.is_setup_sqpoll() {
174            // See discussion in [`SubmissionQueue::need_wakeup`].
175            atomic::fence(atomic::Ordering::SeqCst);
176            if self.sq_need_wakeup() {
177                flags.insert(EnterFlags::SQ_WAKEUP);
178            } else if want == 0 && !need_syscall_for_overflow {
179                // The kernel thread is polling and hasn't fallen asleep, so we don't need to tell
180                // it to process events or wake it up
181
182                // However, if the CQ ring is overflown, we need to tell the kernel to process events
183                // by calling io_uring_enter with the IORING_ENTER_GETEVENTS flag.
184                return Ok(len);
185            }
186        }
187
188        unsafe { self.enter::<libc::sigset_t>(len as _, want as _, flags.bits(), None) }
189    }
190
191    /// Submit all queued submission queue events to the kernel and wait for at least `want`
192    /// completion events to complete with additional options
193    ///
194    /// You can specify a set of signals to mask and a timeout for operation, see
195    /// [`SubmitArgs`](types::SubmitArgs) for more details
196    pub fn submit_with_args(
197        &self,
198        want: usize,
199        args: &types::SubmitArgs<'_, '_>,
200    ) -> io::Result<usize> {
201        let len = self.sq_len();
202        let mut flags = EnterFlags::EXT_ARG;
203
204        let sq_cq_overflow = self.sq_cq_overflow();
205        let need_syscall = sq_cq_overflow & self.params.is_feature_nodrop();
206
207        if want > 0 || self.params.is_setup_iopoll() || sq_cq_overflow {
208            flags.insert(EnterFlags::GETEVENTS);
209        }
210
211        if self.params.is_setup_sqpoll() {
212            // See discussion in [`SubmissionQueue::need_wakeup`].
213            atomic::fence(atomic::Ordering::SeqCst);
214            if self.sq_need_wakeup() {
215                flags.insert(EnterFlags::SQ_WAKEUP);
216            } else if want == 0 && !need_syscall {
217                // The kernel thread is polling and hasn't fallen asleep, so we don't need to tell
218                // it to process events or wake it up
219                return Ok(len);
220            }
221        }
222
223        unsafe { self.enter(len as _, want as _, flags.bits(), Some(args)) }
224    }
225
226    /// Wait for the submission queue to have free entries.
227    pub fn squeue_wait(&self) -> io::Result<usize> {
228        unsafe { self.enter::<libc::sigset_t>(0, 0, EnterFlags::SQ_WAIT.bits(), None) }
229    }
230
231    /// Register in-memory fixed buffers for I/O with the kernel. You can use these buffers with the
232    /// [`ReadFixed`](crate::opcode::ReadFixed) and [`WriteFixed`](crate::opcode::WriteFixed)
233    /// operations.
234    ///
235    /// # Safety
236    ///
237    /// Developers must ensure that the `iov_base` and `iov_len` values are valid and will
238    /// be valid until buffers are unregistered or the ring destroyed, otherwise undefined
239    /// behaviour may occur.
240    pub unsafe fn register_buffers(&self, bufs: &[libc::iovec]) -> io::Result<()> {
241        execute(
242            self.fd.as_raw_fd(),
243            sys::IORING_REGISTER_BUFFERS,
244            bufs.as_ptr().cast(),
245            bufs.len() as _,
246        )
247        .map(drop)
248    }
249
250    /// Update a range of fixed buffers starting at `offset`.
251    ///
252    /// This is required to use buffers registered using
253    /// [`register_buffers_sparse`](Self::register_buffers_sparse),
254    /// although it can be also be used with [`register_buffers`](Self::register_buffers).
255    ///
256    /// See [`register_buffers2`](Self::register_buffers2)
257    /// for more information about resource tagging.
258    ///
259    /// Available since Linux 5.13.
260    ///
261    /// # Safety
262    ///
263    /// Developers must ensure that the `iov_base` and `iov_len` values are valid and will
264    /// be valid until buffers are unregistered or the ring destroyed, otherwise undefined
265    /// behaviour may occur.
266    pub unsafe fn register_buffers_update(
267        &self,
268        offset: u32,
269        bufs: &[libc::iovec],
270        tags: Option<&[u64]>,
271    ) -> io::Result<()> {
272        let nr = tags
273            .as_ref()
274            .map_or(bufs.len(), |tags| bufs.len().min(tags.len()));
275
276        let rr = sys::io_uring_rsrc_update2 {
277            nr: nr as _,
278            data: bufs.as_ptr() as _,
279            tags: tags.map(|tags| tags.as_ptr() as _).unwrap_or(0),
280            offset,
281            ..Default::default()
282        };
283
284        execute(
285            self.fd.as_raw_fd(),
286            sys::IORING_REGISTER_BUFFERS_UPDATE,
287            cast_ptr::<sys::io_uring_rsrc_update2>(&rr).cast(),
288            std::mem::size_of::<sys::io_uring_rsrc_update2>() as _,
289        )
290        .map(drop)
291    }
292
293    /// Variant of [`register_buffers`](Self::register_buffers)
294    /// with resource tagging.
295    ///
296    /// `tags` should be the same length as `bufs` and contain the
297    /// tag value corresponding to the buffer at the same index.
298    ///
299    /// If a tag is zero, then tagging for this particular resource
300    /// (a buffer in this case) is disabled. Otherwise, after the
301    /// resource had been unregistered and it's not used anymore,
302    /// a CQE will be posted with `user_data` set to the specified
303    /// tag and all other fields zeroed.
304    ///
305    /// Available since Linux 5.13.
306    ///
307    /// # Safety
308    ///
309    /// Developers must ensure that the `iov_base` and `iov_len` values are valid and will
310    /// be valid until buffers are unregistered or the ring destroyed, otherwise undefined
311    /// behaviour may occur.
312    pub unsafe fn register_buffers2(&self, bufs: &[libc::iovec], tags: &[u64]) -> io::Result<()> {
313        let rr = sys::io_uring_rsrc_register {
314            nr: bufs.len().min(tags.len()) as _,
315            data: bufs.as_ptr() as _,
316            tags: tags.as_ptr() as _,
317            ..Default::default()
318        };
319        execute(
320            self.fd.as_raw_fd(),
321            sys::IORING_REGISTER_BUFFERS2,
322            cast_ptr::<sys::io_uring_rsrc_register>(&rr).cast(),
323            std::mem::size_of::<sys::io_uring_rsrc_register>() as _,
324        )
325        .map(drop)
326    }
327
328    /// Registers an empty table of nr fixed buffers buffers.
329    ///
330    /// These must be updated before use, using eg.
331    /// [`register_buffers_update`](Self::register_buffers_update).
332    ///
333    /// See [`register_buffers`](Self::register_buffers)
334    /// for more information about fixed buffers.
335    ///
336    /// Available since Linux 5.13.
337    pub fn register_buffers_sparse(&self, nr: u32) -> io::Result<()> {
338        let rr = sys::io_uring_rsrc_register {
339            nr,
340            flags: sys::IORING_RSRC_REGISTER_SPARSE,
341            ..Default::default()
342        };
343        execute(
344            self.fd.as_raw_fd(),
345            sys::IORING_REGISTER_BUFFERS2,
346            cast_ptr::<sys::io_uring_rsrc_register>(&rr).cast(),
347            std::mem::size_of::<sys::io_uring_rsrc_register>() as _,
348        )
349        .map(drop)
350    }
351
352    /// Clone the entire registered buffer table from another ring into this one.
353    ///
354    /// `src_fd` is the raw file descriptor of the source `io_uring`. The source's
355    /// buffers are shared with this ring rather than copied, so a single physical
356    /// registration can back many rings without re-pinning the pages in the kernel.
357    ///
358    /// This ring's buffer table must be empty. To clone into a non-empty table or
359    /// to copy a sub-range, use
360    /// [`register_buffers_clone_offset`](Self::register_buffers_clone_offset).
361    ///
362    /// Available since Linux 6.12.
363    pub fn register_buffers_clone(&self, src_fd: RawFd) -> io::Result<()> {
364        self.register_buffers_clone_offset(src_fd, 0, 0, 0, CloneBuffersFlags::empty())
365    }
366
367    /// Clone a range of the registered buffer table from another ring into this one.
368    ///
369    /// `src_fd` is the raw file descriptor of the source `io_uring`. `nr` buffers
370    /// starting at `src_off` in the source table are installed starting at `dst_off`
371    /// in this ring's table. A `nr` of `0` clones the source's entire table.
372    ///
373    /// See [`CloneBuffersFlags`] for replacing an existing destination range or
374    /// treating `src_fd` as a registered ring descriptor.
375    ///
376    /// Available since Linux 6.12.
377    pub fn register_buffers_clone_offset(
378        &self,
379        src_fd: RawFd,
380        src_off: u32,
381        dst_off: u32,
382        nr: u32,
383        flags: CloneBuffersFlags,
384    ) -> io::Result<()> {
385        let arg = sys::io_uring_clone_buffers {
386            src_fd: src_fd as _,
387            flags: flags.bits(),
388            src_off,
389            dst_off,
390            nr,
391            ..Default::default()
392        };
393        execute(
394            self.fd.as_raw_fd(),
395            sys::IORING_REGISTER_CLONE_BUFFERS,
396            cast_ptr::<sys::io_uring_clone_buffers>(&arg).cast(),
397            // This opcode takes a single struct; the kernel requires nr_args == 1.
398            1,
399        )
400        .map(drop)
401    }
402
403    /// Registers an empty file table of nr_files number of file descriptors. The sparse variant is
404    /// available in kernels 5.19 and later.
405    ///
406    /// Registering a file table is a prerequisite for using any request that
407    /// uses direct descriptors.
408    pub fn register_files_sparse(&self, nr: u32) -> io::Result<()> {
409        let rr = sys::io_uring_rsrc_register {
410            nr,
411            flags: sys::IORING_RSRC_REGISTER_SPARSE,
412            resv2: 0,
413            data: 0,
414            tags: 0,
415        };
416        execute(
417            self.fd.as_raw_fd(),
418            sys::IORING_REGISTER_FILES2,
419            cast_ptr::<sys::io_uring_rsrc_register>(&rr).cast(),
420            mem::size_of::<sys::io_uring_rsrc_register>() as _,
421        )
422        .map(drop)
423    }
424
425    /// Register files for I/O. You can use the registered files with
426    /// [`Fixed`](crate::types::Fixed).
427    ///
428    /// Each fd may be -1, in which case it is considered "sparse", and can be filled in later with
429    /// [`register_files_update`](Self::register_files_update).
430    ///
431    /// Note that this will wait for the ring to idle; it will only return once all active requests
432    /// are complete. Use [`register_files_update`](Self::register_files_update) to avoid this.
433    pub fn register_files(&self, fds: &[RawFd]) -> io::Result<()> {
434        execute(
435            self.fd.as_raw_fd(),
436            sys::IORING_REGISTER_FILES,
437            fds.as_ptr().cast(),
438            fds.len() as _,
439        )
440        .map(drop)
441    }
442
443    /// This operation replaces existing files in the registered file set with new ones,
444    /// either turning a sparse entry (one where fd is equal to -1) into a real one, removing an existing entry (new one is set to -1),
445    /// or replacing an existing entry with a new existing entry. The `offset` parameter specifies
446    /// the offset into the list of registered files at which to start updating files.
447    ///
448    /// You can also perform this asynchronously with the
449    /// [`FilesUpdate`](crate::opcode::FilesUpdate) opcode.
450    pub fn register_files_update(&self, offset: u32, fds: &[RawFd]) -> io::Result<usize> {
451        let fu = sys::io_uring_files_update {
452            offset,
453            resv: 0,
454            fds: fds.as_ptr() as _,
455        };
456        let ret = execute(
457            self.fd.as_raw_fd(),
458            sys::IORING_REGISTER_FILES_UPDATE,
459            cast_ptr::<sys::io_uring_files_update>(&fu).cast(),
460            fds.len() as _,
461        )?;
462        Ok(ret as _)
463    }
464
465    /// Register an eventfd created by [`eventfd`](libc::eventfd) with the io_uring instance.
466    pub fn register_eventfd(&self, eventfd: RawFd) -> io::Result<()> {
467        execute(
468            self.fd.as_raw_fd(),
469            sys::IORING_REGISTER_EVENTFD,
470            cast_ptr::<RawFd>(&eventfd).cast(),
471            1,
472        )
473        .map(drop)
474    }
475
476    /// This works just like [`register_eventfd`](Self::register_eventfd), except notifications are
477    /// only posted for events that complete in an async manner, so requests that complete
478    /// immediately will not cause a notification.
479    pub fn register_eventfd_async(&self, eventfd: RawFd) -> io::Result<()> {
480        execute(
481            self.fd.as_raw_fd(),
482            sys::IORING_REGISTER_EVENTFD_ASYNC,
483            cast_ptr::<RawFd>(&eventfd).cast(),
484            1,
485        )
486        .map(drop)
487    }
488
489    /// Fill in the given [`Probe`] with information about the opcodes supported by io_uring on the
490    /// running kernel.
491    ///
492    /// # Examples
493    ///
494    // This is marked no_run as it is only available from Linux 5.6+, however the latest Ubuntu (on
495    // which CI runs) only has Linux 5.4.
496    /// ```no_run
497    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
498    /// let io_uring = io_uring::IoUring::new(1)?;
499    /// let mut probe = io_uring::Probe::new();
500    /// io_uring.submitter().register_probe(&mut probe)?;
501    ///
502    /// if probe.is_supported(io_uring::opcode::Read::CODE) {
503    ///     println!("Reading is supported!");
504    /// }
505    /// # Ok(())
506    /// # }
507    /// ```
508    pub fn register_probe(&self, probe: &mut Probe) -> io::Result<()> {
509        execute(
510            self.fd.as_raw_fd(),
511            sys::IORING_REGISTER_PROBE,
512            probe.as_mut_ptr() as *const _,
513            Probe::COUNT as _,
514        )
515        .map(drop)
516    }
517
518    /// Register credentials of the running application with io_uring, and get an id associated with
519    /// these credentials. This ID can then be [passed](crate::squeue::Entry::personality) into
520    /// submission queue entries to issue the request with this process' credentials.
521    ///
522    /// By default, if [`Parameters::is_feature_cur_personality`] is set then requests will use the
523    /// credentials of the task that called [`Submitter::enter`], otherwise they will use the
524    /// credentials of the task that originally registered the io_uring.
525    ///
526    /// [`Parameters::is_feature_cur_personality`]: crate::Parameters::is_feature_cur_personality
527    pub fn register_personality(&self) -> io::Result<u16> {
528        let id = execute(
529            self.fd.as_raw_fd(),
530            sys::IORING_REGISTER_PERSONALITY,
531            ptr::null(),
532            0,
533        )?;
534        Ok(id as u16)
535    }
536
537    /// Unregister all previously registered buffers.
538    ///
539    /// You do not need to explicitly call this before dropping the [`IoUring`](crate::IoUring), as
540    /// it will be cleaned up by the kernel automatically.
541    ///
542    /// Available since Linux 5.1.
543    pub fn unregister_buffers(&self) -> io::Result<()> {
544        execute(
545            self.fd.as_raw_fd(),
546            sys::IORING_UNREGISTER_BUFFERS,
547            ptr::null(),
548            0,
549        )
550        .map(drop)
551    }
552
553    /// Unregister all previously registered files.
554    ///
555    /// You do not need to explicitly call this before dropping the [`IoUring`](crate::IoUring), as
556    /// it will be cleaned up by the kernel automatically.
557    pub fn unregister_files(&self) -> io::Result<()> {
558        execute(
559            self.fd.as_raw_fd(),
560            sys::IORING_UNREGISTER_FILES,
561            ptr::null(),
562            0,
563        )
564        .map(drop)
565    }
566
567    /// Unregister an eventfd file descriptor to stop notifications.
568    pub fn unregister_eventfd(&self) -> io::Result<()> {
569        execute(
570            self.fd.as_raw_fd(),
571            sys::IORING_UNREGISTER_EVENTFD,
572            ptr::null(),
573            0,
574        )
575        .map(drop)
576    }
577
578    /// Unregister a previously registered personality.
579    pub fn unregister_personality(&self, personality: u16) -> io::Result<()> {
580        execute(
581            self.fd.as_raw_fd(),
582            sys::IORING_UNREGISTER_PERSONALITY,
583            ptr::null(),
584            personality as _,
585        )
586        .map(drop)
587    }
588
589    /// Permanently install a feature allowlist. Once this has been called, attempting to perform
590    /// an operation not on the allowlist will fail with `-EACCES`.
591    ///
592    /// This can only be called once, to prevent untrusted code from removing restrictions.
593    pub fn register_restrictions(&self, res: &mut [Restriction]) -> io::Result<()> {
594        execute(
595            self.fd.as_raw_fd(),
596            sys::IORING_REGISTER_RESTRICTIONS,
597            res.as_mut_ptr().cast(),
598            res.len() as _,
599        )
600        .map(drop)
601    }
602
603    /// Enable the rings of the io_uring instance if they have been disabled with
604    /// [`setup_r_disabled`](crate::Builder::setup_r_disabled).
605    pub fn register_enable_rings(&self) -> io::Result<()> {
606        execute(
607            self.fd.as_raw_fd(),
608            sys::IORING_REGISTER_ENABLE_RINGS,
609            ptr::null(),
610            0,
611        )
612        .map(drop)
613    }
614
615    /// Tell io_uring on what CPUs the async workers can run. By default, async workers
616    /// created by io_uring will inherit the CPU mask of its parent. This is usually
617    /// all the CPUs in the system, unless the parent is being run with a limited set.
618    pub fn register_iowq_aff(&self, cpu_set: &libc::cpu_set_t) -> io::Result<()> {
619        execute(
620            self.fd.as_raw_fd(),
621            sys::IORING_REGISTER_IOWQ_AFF,
622            cpu_set as *const _ as *const libc::c_void,
623            mem::size_of::<libc::cpu_set_t>() as u32,
624        )
625        .map(drop)
626    }
627
628    /// Undoes a CPU mask previously set with register_iowq_aff
629    pub fn unregister_iowq_aff(&self) -> io::Result<()> {
630        execute(
631            self.fd.as_raw_fd(),
632            sys::IORING_UNREGISTER_IOWQ_AFF,
633            ptr::null(),
634            0,
635        )
636        .map(drop)
637    }
638
639    /// Get and/or set the limit for number of io_uring worker threads per NUMA
640    /// node. `max[0]` holds the limit for bounded workers, which process I/O
641    /// operations expected to be bound in time, that is I/O on regular files or
642    /// block devices. While `max[1]` holds the limit for unbounded workers,
643    /// which carry out I/O operations that can never complete, for instance I/O
644    /// on sockets. Passing `0` does not change the current limit. Returns
645    /// previous limits on success.
646    pub fn register_iowq_max_workers(&self, max: &mut [u32; 2]) -> io::Result<()> {
647        execute(
648            self.fd.as_raw_fd(),
649            sys::IORING_REGISTER_IOWQ_MAX_WORKERS,
650            max.as_mut_ptr().cast(),
651            max.len() as _,
652        )
653        .map(drop)
654    }
655
656    /// Register NAPI busy-poll settings on this ring.
657    ///
658    /// The kernel writes the previous settings back into `napi` before applying the new
659    /// ones; read them back with [`Napi::busy_poll_timeout`] and
660    /// [`Napi::prefer_busy_poll`].
661    ///
662    /// Available since Linux 6.9.
663    pub fn register_napi(&self, napi: &mut Napi) -> io::Result<()> {
664        execute(
665            self.fd.as_raw_fd(),
666            sys::IORING_REGISTER_NAPI,
667            napi.as_mut_ptr().cast(),
668            1,
669        )
670        .map(drop)
671    }
672
673    /// Unregister NAPI busy-poll from this ring.
674    ///
675    /// The kernel writes the current settings back into `napi` before disabling them;
676    /// read them back with [`Napi::busy_poll_timeout`] and [`Napi::prefer_busy_poll`]. A
677    /// valid buffer is required, as the kernel rejects a null argument with `EINVAL`.
678    ///
679    /// Available since Linux 6.9.
680    pub fn unregister_napi(&self, napi: &mut Napi) -> io::Result<()> {
681        execute(
682            self.fd.as_raw_fd(),
683            sys::IORING_UNREGISTER_NAPI,
684            napi.as_mut_ptr().cast(),
685            1,
686        )
687        .map(drop)
688    }
689
690    /// Add a NAPI id to this ring's statically tracked busy-poll set.
691    ///
692    /// The ring must already be registered with [`NapiTracking::Static`]; otherwise the
693    /// kernel returns an error. `napi_id` identifies a NIC receive-queue NAPI instance,
694    /// typically obtained from a socket via the `SO_INCOMING_NAPI_ID` socket option.
695    ///
696    /// [`NapiTracking::Static`]: crate::types::NapiTracking::Static
697    ///
698    /// Available since Linux 6.13.
699    pub fn register_napi_add_id(&self, napi_id: u32) -> io::Result<()> {
700        self.register_napi_static_op(sys::IO_URING_NAPI_STATIC_ADD_ID as _, napi_id)
701    }
702
703    /// Remove a NAPI id from this ring's statically tracked busy-poll set.
704    ///
705    /// The ring must already be registered with [`NapiTracking::Static`]; otherwise the
706    /// kernel returns an error. See [`register_napi_add_id`](Self::register_napi_add_id).
707    ///
708    /// [`NapiTracking::Static`]: crate::types::NapiTracking::Static
709    ///
710    /// Available since Linux 6.13.
711    pub fn register_napi_del_id(&self, napi_id: u32) -> io::Result<()> {
712        self.register_napi_static_op(sys::IO_URING_NAPI_STATIC_DEL_ID as _, napi_id)
713    }
714
715    fn register_napi_static_op(&self, opcode: u8, napi_id: u32) -> io::Result<()> {
716        // Both ops are issued through IORING_REGISTER_NAPI, distinguished by `opcode`,
717        // with the NAPI id carried in `op_param`. The kernel writes the current settings
718        // back into the struct, so pass a mutable pointer even though we discard them.
719        let mut arg = sys::io_uring_napi {
720            opcode,
721            op_param: napi_id,
722            ..Default::default()
723        };
724        execute(
725            self.fd.as_raw_fd(),
726            sys::IORING_REGISTER_NAPI,
727            (&mut arg as *mut sys::io_uring_napi).cast(),
728            1,
729        )
730        .map(drop)
731    }
732
733    /// Register buffer ring for provided buffers.
734    ///
735    /// Details can be found in the io_uring_register_buf_ring.3 man page.
736    ///
737    /// If the register command is not supported, or the ring_entries value exceeds
738    /// 32768, the InvalidInput error is returned.
739    ///
740    /// Available since 5.19.
741    ///
742    /// # Safety
743    ///
744    /// Developers must ensure that the `ring_addr` and its length represented by `ring_entries`
745    /// are valid and will be valid until the bgid is unregistered or the ring destroyed,
746    /// otherwise undefined behaviour may occur.
747    #[deprecated(note = "please use `register_buf_ring_with_flags` instead")]
748    pub unsafe fn register_buf_ring(
749        &self,
750        ring_addr: u64,
751        ring_entries: u16,
752        bgid: u16,
753    ) -> io::Result<()> {
754        self.register_buf_ring_with_flags(ring_addr, ring_entries, bgid, 0)
755    }
756
757    /// Register buffer ring for provided buffers.
758    ///
759    /// Details can be found in the io_uring_register_buf_ring.3 man page.
760    ///
761    /// If the register command is not supported, or the ring_entries value exceeds
762    /// 32768, the InvalidInput error is returned.
763    ///
764    /// Available since 5.19.
765    ///
766    /// # Safety
767    ///
768    /// Developers must ensure that the `ring_addr` and its length represented by `ring_entries`
769    /// are valid and will be valid until the bgid is unregistered or the ring destroyed,
770    /// otherwise undefined behaviour may occur.
771    pub unsafe fn register_buf_ring_with_flags(
772        &self,
773        ring_addr: u64,
774        ring_entries: u16,
775        bgid: u16,
776        flags: u16,
777    ) -> io::Result<()> {
778        // The interface type for ring_entries is u32 but the same interface only allows a u16 for
779        // the tail to be specified, so to try and avoid further confusion, we limit the
780        // ring_entries to u16 here too. The value is actually limited to 2^15 (32768) but we can
781        // let the kernel enforce that.
782        let arg = sys::io_uring_buf_reg {
783            ring_addr,
784            ring_entries: ring_entries as _,
785            bgid,
786            flags,
787            ..Default::default()
788        };
789        execute(
790            self.fd.as_raw_fd(),
791            sys::IORING_REGISTER_PBUF_RING,
792            cast_ptr::<sys::io_uring_buf_reg>(&arg).cast(),
793            1,
794        )
795        .map(drop)
796    }
797
798    /// Unregister a previously registered buffer ring.
799    ///
800    /// Available since 5.19.
801    pub fn unregister_buf_ring(&self, bgid: u16) -> io::Result<()> {
802        let arg = sys::io_uring_buf_reg {
803            ring_addr: 0,
804            ring_entries: 0,
805            bgid,
806            ..Default::default()
807        };
808        execute(
809            self.fd.as_raw_fd(),
810            sys::IORING_UNREGISTER_PBUF_RING,
811            cast_ptr::<sys::io_uring_buf_reg>(&arg).cast(),
812            1,
813        )
814        .map(drop)
815    }
816
817    /// Performs a synchronous cancellation request, similar to [AsyncCancel](crate::opcode::AsyncCancel),
818    /// except that it completes synchronously.
819    ///
820    /// Cancellation can target a specific request, or all requests matching some criteria. The
821    /// [`CancelBuilder`] builder supports describing the match criteria for cancellation.
822    ///
823    /// An optional `timeout` can be provided to specify how long to wait for matched requests to be
824    /// canceled. If no timeout is provided, the default is to wait indefinitely.
825    ///
826    /// ### Errors
827    ///
828    /// If no requests are matched, returns:
829    ///
830    /// [io::ErrorKind::NotFound]: `No such file or directory (os error 2)`
831    ///
832    /// If a timeout is supplied, and the timeout elapses prior to all requests being canceled, returns:
833    ///
834    /// [io::ErrorKind::Uncategorized]: `Timer expired (os error 62)`
835    ///
836    /// ### Notes
837    ///
838    /// Only requests which have been submitted to the ring will be considered for cancellation. Requests
839    /// which have been written to the SQ, but not submitted, will not be canceled.
840    ///
841    /// Available since 6.0.
842    pub fn register_sync_cancel(
843        &self,
844        timeout: Option<Timespec>,
845        builder: CancelBuilder,
846    ) -> io::Result<()> {
847        let timespec = timeout.map(|ts| ts.0).unwrap_or(sys::__kernel_timespec {
848            tv_sec: -1,
849            tv_nsec: -1,
850        });
851        let user_data = builder.user_data.unwrap_or(0);
852        let flags = builder.flags.bits();
853        let fd = builder.to_fd();
854
855        let arg = sys::io_uring_sync_cancel_reg {
856            addr: user_data,
857            fd,
858            flags,
859            timeout: timespec,
860            ..Default::default()
861        };
862
863        execute(
864            self.fd.as_raw_fd(),
865            sys::IORING_REGISTER_SYNC_CANCEL,
866            cast_ptr::<sys::io_uring_sync_cancel_reg>(&arg).cast(),
867            1,
868        )
869        .map(drop)
870    }
871
872    /// Register a netdev hw rx queue for zerocopy.
873    ///
874    /// Available since 6.15.
875    pub fn register_ifq(&self, reg: &sys::io_uring_zcrx_ifq_reg) -> io::Result<()> {
876        execute(
877            self.fd.as_raw_fd(),
878            sys::IORING_REGISTER_ZCRX_IFQ,
879            cast_ptr::<sys::io_uring_zcrx_ifq_reg>(reg) as _,
880            1,
881        )
882        .map(drop)
883    }
884}