quinn_udp/
unix.rs

1#[cfg(not(any(apple, target_os = "openbsd", solarish)))]
2use std::ptr;
3use std::{
4    io::{self, IoSliceMut},
5    mem::{self, MaybeUninit},
6    net::{IpAddr, Ipv4Addr, Ipv6Addr, SocketAddr, SocketAddrV4, SocketAddrV6},
7    os::unix::io::AsRawFd,
8    sync::{
9        Mutex,
10        atomic::{AtomicBool, AtomicUsize, Ordering},
11    },
12    time::Instant,
13};
14
15use socket2::SockRef;
16
17use super::{
18    EcnCodepoint, IO_ERROR_LOG_INTERVAL, RecvMeta, Transmit, UdpSockRef, cmsg, log_sendmsg_error,
19};
20
// Adapted from https://github.com/apple-oss-distributions/xnu/blob/8d741a5de7ff4191bf97d57b9f54c2f6d4a15585/bsd/sys/socket_private.h
/// Per-datagram message header handed to `recvmsg_x` / `sendmsg_x`.
///
/// `prepare_msg` and `prepare_recv` fill the fields other than `msg_datalen` exactly as
/// they fill a `libc::msghdr`. The struct is `#[repr(C)]`, so the field order below is
/// the in-memory order and must not be rearranged.
#[cfg(apple_fast)]
#[repr(C)]
#[allow(non_camel_case_types)]
pub(crate) struct msghdr_x {
    /// Socket address: the destination when sending, the peer address buffer when receiving.
    pub msg_name: *mut libc::c_void,
    /// Size in bytes of the address that `msg_name` points to.
    pub msg_namelen: libc::socklen_t,
    /// Scatter/gather array; this file always supplies exactly one `iovec`.
    pub msg_iov: *mut libc::iovec,
    /// Number of entries in `msg_iov` (always set to 1 in this file).
    pub msg_iovlen: libc::c_int,
    /// Control message (ancillary data) buffer.
    pub msg_control: *mut libc::c_void,
    /// Size in bytes of the buffer behind `msg_control` (set to `CMSG_LEN` in this file).
    pub msg_controllen: libc::socklen_t,
    /// Message flags; `prepare_recv` clears this to 0 before receiving.
    pub msg_flags: libc::c_int,
    /// Datagram length: `send` stores the chunk length here, `prepare_recv` stores the
    /// buffer length, and `recv` reads it back as the received length after `recvmsg_x`.
    pub msg_datalen: usize,
}
35
// Batch datagram calls operating on arrays of `msghdr_x`.
// NOTE(review): presumably the private Darwin calls declared next to `msghdr_x` in xnu's
// socket_private.h — confirm against that header.
#[cfg(apple_fast)]
extern "C" {
    /// Receives into up to `cnt` headers starting at `msgp` on socket `s`.
    ///
    /// `recv` treats a non-negative return value as the number of headers that were
    /// filled in, and a negative one as a failure whose cause is read from `errno`.
    // NOTE(review): `msgp` is declared `*const` although `recv` passes a mutable pointer
    // and reads the headers back afterwards — confirm the intended constness.
    fn recvmsg_x(
        s: libc::c_int,
        msgp: *const msghdr_x,
        cnt: libc::c_uint,
        flags: libc::c_int,
    ) -> isize;

    /// Submits the `cnt` headers starting at `msgp` for transmission on socket `s`.
    ///
    /// `send` treats any non-negative return value as success and a negative one as a
    /// failure whose cause is read from `errno`.
    fn sendmsg_x(
        s: libc::c_int,
        msgp: *const msghdr_x,
        cnt: libc::c_uint,
        flags: libc::c_int,
    ) -> isize;
}
52
// Socket option used in `UdpSocketState::new` to forbid IPv6 fragmentation.
//
// Defined in netinet6/in6.h on OpenBSD, this is not yet exported by the libc crate
// directly.  See https://github.com/rust-lang/libc/issues/3704 for when we might be able to
// rely on this from the libc crate.
// The same hard-coded value is used when building for NetBSD.
#[cfg(any(target_os = "openbsd", target_os = "netbsd"))]
const IPV6_DONTFRAG: libc::c_int = 62;
// Every other target takes the value straight from the libc crate.
#[cfg(not(any(target_os = "openbsd", target_os = "netbsd")))]
const IPV6_DONTFRAG: libc::c_int = libc::IPV6_DONTFRAG;
60
// Integer type of the `IP_TOS` control message payload that `prepare_msg` encodes
// (`ecn as IpTosTy`). FreeBSD uses a single byte, the remaining targets a C int.
// No alias is defined for NetBSD, where `prepare_msg` compiles the `IP_TOS` push out.
#[cfg(target_os = "freebsd")]
type IpTosTy = libc::c_uchar;
#[cfg(not(any(target_os = "freebsd", target_os = "netbsd")))]
type IpTosTy = libc::c_int;
65
/// Tokio-compatible UDP socket with some useful specializations.
///
/// Unlike a standard tokio UDP socket, this allows ECN bits to be read and written on some
/// platforms.
#[derive(Debug)]
pub struct UdpSocketState {
    /// Timestamp handed to `log_sendmsg_error` whenever [`UdpSocketState::send`] logs a
    /// failure. Starts out `2 * IO_ERROR_LOG_INTERVAL` in the past, or at creation time
    /// if that subtraction is not representable.
    last_send_error: Mutex<Instant>,
    /// Value reported by [`UdpSocketState::max_gso_segments`]. Lowered to 1 on
    /// Linux/Android when `sendmsg` fails with EIO or EINVAL.
    max_gso_segments: AtomicUsize,
    /// Value reported by [`UdpSocketState::gro_segments`]; fixed at construction.
    gro_segments: usize,
    /// Value reported by [`UdpSocketState::may_fragment`]; set during construction when a
    /// fragmentation-related socket option turns out not to be supported.
    may_fragment: bool,

    /// True if we have received EINVAL error from `sendmsg` system call at least once.
    ///
    /// If enabled, we assume that old kernel is used and switch to fallback mode.
    /// In particular, we do not use IP_TOS cmsg_type in this case,
    /// which is not supported on Linux <3.13 and results in not sending the UDP packet at all.
    sendmsg_einval: AtomicBool,
}
84
85impl UdpSocketState {
86    pub fn new(sock: UdpSockRef<'_>) -> io::Result<Self> {
87        let io = sock.0;
88        let mut cmsg_platform_space = 0;
89        if cfg!(target_os = "linux")
90            || cfg!(bsd)
91            || cfg!(apple)
92            || cfg!(target_os = "android")
93            || cfg!(solarish)
94        {
95            cmsg_platform_space +=
96                unsafe { libc::CMSG_SPACE(mem::size_of::<libc::in6_pktinfo>() as _) as usize };
97        }
98
99        assert!(
100            CMSG_LEN
101                >= unsafe { libc::CMSG_SPACE(mem::size_of::<libc::c_int>() as _) as usize }
102                    + cmsg_platform_space
103        );
104        assert!(
105            mem::align_of::<libc::cmsghdr>() <= mem::align_of::<cmsg::Aligned<[u8; 0]>>(),
106            "control message buffers will be misaligned"
107        );
108
109        io.set_nonblocking(true)?;
110
111        let addr = io.local_addr()?;
112        let is_ipv4 = addr.family() == libc::AF_INET as libc::sa_family_t;
113
114        // mac and ios do not support IP_RECVTOS on dual-stack sockets :(
115        // older macos versions also don't have the flag and will error out if we don't ignore it
116        #[cfg(not(any(target_os = "openbsd", target_os = "netbsd", solarish)))]
117        if is_ipv4 || !io.only_v6()? {
118            if let Err(_err) =
119                set_socket_option(&*io, libc::IPPROTO_IP, libc::IP_RECVTOS, OPTION_ON)
120            {
121                crate::log::debug!("Ignoring error setting IP_RECVTOS on socket: {_err:?}");
122            }
123        }
124
125        let mut may_fragment = false;
126        #[cfg(any(target_os = "linux", target_os = "android"))]
127        {
128            // opportunistically try to enable GRO. See gro::gro_segments().
129            let _ = set_socket_option(&*io, libc::SOL_UDP, gro::UDP_GRO, OPTION_ON);
130
131            // Forbid IPv4 fragmentation. Set even for IPv6 to account for IPv6 mapped IPv4 addresses.
132            // Set `may_fragment` to `true` if this option is not supported on the platform.
133            may_fragment |= !set_socket_option_supported(
134                &*io,
135                libc::IPPROTO_IP,
136                libc::IP_MTU_DISCOVER,
137                libc::IP_PMTUDISC_PROBE,
138            )?;
139
140            if is_ipv4 {
141                set_socket_option(&*io, libc::IPPROTO_IP, libc::IP_PKTINFO, OPTION_ON)?;
142            } else {
143                // Set `may_fragment` to `true` if this option is not supported on the platform.
144                may_fragment |= !set_socket_option_supported(
145                    &*io,
146                    libc::IPPROTO_IPV6,
147                    libc::IPV6_MTU_DISCOVER,
148                    libc::IPV6_PMTUDISC_PROBE,
149                )?;
150            }
151        }
152        #[cfg(any(target_os = "freebsd", apple))]
153        {
154            if is_ipv4 {
155                // Set `may_fragment` to `true` if this option is not supported on the platform.
156                may_fragment |= !set_socket_option_supported(
157                    &*io,
158                    libc::IPPROTO_IP,
159                    libc::IP_DONTFRAG,
160                    OPTION_ON,
161                )?;
162            }
163        }
164        #[cfg(any(bsd, apple, solarish))]
165        // IP_RECVDSTADDR == IP_SENDSRCADDR on FreeBSD
166        // macOS uses only IP_RECVDSTADDR, no IP_SENDSRCADDR on macOS (the same on Solaris)
167        // macOS also supports IP_PKTINFO
168        {
169            if is_ipv4 {
170                set_socket_option(&*io, libc::IPPROTO_IP, libc::IP_RECVDSTADDR, OPTION_ON)?;
171            }
172        }
173
174        // Options standardized in RFC 3542
175        if !is_ipv4 {
176            set_socket_option(&*io, libc::IPPROTO_IPV6, libc::IPV6_RECVPKTINFO, OPTION_ON)?;
177            set_socket_option(&*io, libc::IPPROTO_IPV6, libc::IPV6_RECVTCLASS, OPTION_ON)?;
178            // Linux's IP_PMTUDISC_PROBE allows us to operate under interface MTU rather than the
179            // kernel's path MTU guess, but actually disabling fragmentation requires this too. See
180            // __ip6_append_data in ip6_output.c.
181            // Set `may_fragment` to `true` if this option is not supported on the platform.
182            may_fragment |=
183                !set_socket_option_supported(&*io, libc::IPPROTO_IPV6, IPV6_DONTFRAG, OPTION_ON)?;
184        }
185
186        let now = Instant::now();
187        Ok(Self {
188            last_send_error: Mutex::new(now.checked_sub(2 * IO_ERROR_LOG_INTERVAL).unwrap_or(now)),
189            max_gso_segments: AtomicUsize::new(gso::max_gso_segments()),
190            gro_segments: gro::gro_segments(),
191            may_fragment,
192            sendmsg_einval: AtomicBool::new(false),
193        })
194    }
195
196    /// Sends a [`Transmit`] on the given socket.
197    ///
198    /// This function will only ever return errors of kind [`io::ErrorKind::WouldBlock`].
199    /// All other errors will be logged and converted to `Ok`.
200    ///
201    /// UDP transmission errors are considered non-fatal because higher-level protocols must
202    /// employ retransmits and timeouts anyway in order to deal with UDP's unreliable nature.
203    /// Thus, logging is most likely the only thing you can do with these errors.
204    ///
205    /// If you would like to handle these errors yourself, use [`UdpSocketState::try_send`]
206    /// instead.
207    pub fn send(&self, socket: UdpSockRef<'_>, transmit: &Transmit<'_>) -> io::Result<()> {
208        match send(self, socket.0, transmit) {
209            Ok(()) => Ok(()),
210            Err(e) if e.kind() == io::ErrorKind::WouldBlock => Err(e),
211            // - EMSGSIZE is expected for MTU probes. Future work might be able to avoid
212            //   these by automatically clamping the MTUD upper bound to the interface MTU.
213            Err(e) if e.raw_os_error() == Some(libc::EMSGSIZE) => Ok(()),
214            Err(e) => {
215                log_sendmsg_error(&self.last_send_error, e, transmit);
216
217                Ok(())
218            }
219        }
220    }
221
222    /// Sends a [`Transmit`] on the given socket without any additional error handling.
223    pub fn try_send(&self, socket: UdpSockRef<'_>, transmit: &Transmit<'_>) -> io::Result<()> {
224        send(self, socket.0, transmit)
225    }
226
227    pub fn recv(
228        &self,
229        socket: UdpSockRef<'_>,
230        bufs: &mut [IoSliceMut<'_>],
231        meta: &mut [RecvMeta],
232    ) -> io::Result<usize> {
233        recv(socket.0, bufs, meta)
234    }
235
236    /// The maximum amount of segments which can be transmitted if a platform
237    /// supports Generic Send Offload (GSO).
238    ///
239    /// This is 1 if the platform doesn't support GSO. Subject to change if errors are detected
240    /// while using GSO.
241    #[inline]
242    pub fn max_gso_segments(&self) -> usize {
243        self.max_gso_segments.load(Ordering::Relaxed)
244    }
245
246    /// The number of segments to read when GRO is enabled. Used as a factor to
247    /// compute the receive buffer size.
248    ///
249    /// Returns 1 if the platform doesn't support GRO.
250    #[inline]
251    pub fn gro_segments(&self) -> usize {
252        self.gro_segments
253    }
254
255    /// Resize the send buffer of `socket` to `bytes`
256    #[inline]
257    pub fn set_send_buffer_size(&self, socket: UdpSockRef<'_>, bytes: usize) -> io::Result<()> {
258        socket.0.set_send_buffer_size(bytes)
259    }
260
261    /// Resize the receive buffer of `socket` to `bytes`
262    #[inline]
263    pub fn set_recv_buffer_size(&self, socket: UdpSockRef<'_>, bytes: usize) -> io::Result<()> {
264        socket.0.set_recv_buffer_size(bytes)
265    }
266
267    /// Get the size of the `socket` send buffer
268    #[inline]
269    pub fn send_buffer_size(&self, socket: UdpSockRef<'_>) -> io::Result<usize> {
270        socket.0.send_buffer_size()
271    }
272
273    /// Get the size of the `socket` receive buffer
274    #[inline]
275    pub fn recv_buffer_size(&self, socket: UdpSockRef<'_>) -> io::Result<usize> {
276        socket.0.recv_buffer_size()
277    }
278
279    /// Whether transmitted datagrams might get fragmented by the IP layer
280    ///
281    /// Returns `false` on targets which employ e.g. the `IPV6_DONTFRAG` socket option.
282    #[inline]
283    pub fn may_fragment(&self) -> bool {
284        self.may_fragment
285    }
286
287    /// Returns true if we previously got an EINVAL error from `sendmsg` syscall.
288    fn sendmsg_einval(&self) -> bool {
289        self.sendmsg_einval.load(Ordering::Relaxed)
290    }
291
292    /// Sets the flag indicating we got EINVAL error from `sendmsg` syscall.
293    #[cfg(not(any(apple, target_os = "openbsd", target_os = "netbsd")))]
294    fn set_sendmsg_einval(&self) {
295        self.sendmsg_einval.store(true, Ordering::Relaxed)
296    }
297}
298
299#[cfg(not(any(apple, target_os = "openbsd", target_os = "netbsd")))]
300fn send(
301    #[allow(unused_variables)] // only used on Linux
302    state: &UdpSocketState,
303    io: SockRef<'_>,
304    transmit: &Transmit<'_>,
305) -> io::Result<()> {
306    #[allow(unused_mut)] // only mutable on FreeBSD
307    let mut encode_src_ip = true;
308    #[cfg(target_os = "freebsd")]
309    {
310        let addr = io.local_addr()?;
311        let is_ipv4 = addr.family() == libc::AF_INET as libc::sa_family_t;
312        if is_ipv4 {
313            if let Some(socket) = addr.as_socket_ipv4() {
314                encode_src_ip = socket.ip() == &Ipv4Addr::UNSPECIFIED;
315            }
316        }
317    }
318    let mut msg_hdr: libc::msghdr = unsafe { mem::zeroed() };
319    let mut iovec: libc::iovec = unsafe { mem::zeroed() };
320    let mut cmsgs = cmsg::Aligned([0u8; CMSG_LEN]);
321    let dst_addr = socket2::SockAddr::from(transmit.destination);
322    prepare_msg(
323        transmit,
324        &dst_addr,
325        &mut msg_hdr,
326        &mut iovec,
327        &mut cmsgs,
328        encode_src_ip,
329        state.sendmsg_einval(),
330    );
331
332    loop {
333        let n = unsafe { libc::sendmsg(io.as_raw_fd(), &msg_hdr, 0) };
334
335        if n >= 0 {
336            return Ok(());
337        }
338
339        let e = io::Error::last_os_error();
340        match e.kind() {
341            // Retry the transmission
342            io::ErrorKind::Interrupted => continue,
343            io::ErrorKind::WouldBlock => return Err(e),
344            _ => {
345                // Some network adapters and drivers do not support GSO. Unfortunately, Linux
346                // offers no easy way for us to detect this short of an EIO or sometimes EINVAL
347                // when we try to actually send datagrams using it.
348                #[cfg(any(target_os = "linux", target_os = "android"))]
349                if let Some(libc::EIO) | Some(libc::EINVAL) = e.raw_os_error() {
350                    // Prevent new transmits from being scheduled using GSO. Existing GSO transmits
351                    // may already be in the pipeline, so we need to tolerate additional failures.
352                    if state.max_gso_segments() > 1 {
353                        crate::log::info!(
354                            "`libc::sendmsg` failed with {e}; halting segmentation offload"
355                        );
356                        state
357                            .max_gso_segments
358                            .store(1, std::sync::atomic::Ordering::Relaxed);
359                    }
360                }
361
362                // Some arguments to `sendmsg` are not supported. Switch to
363                // fallback mode and retry if we haven't already.
364                if e.raw_os_error() == Some(libc::EINVAL) && !state.sendmsg_einval() {
365                    state.set_sendmsg_einval();
366                    prepare_msg(
367                        transmit,
368                        &dst_addr,
369                        &mut msg_hdr,
370                        &mut iovec,
371                        &mut cmsgs,
372                        encode_src_ip,
373                        state.sendmsg_einval(),
374                    );
375                    continue;
376                }
377
378                return Err(e);
379            }
380        }
381    }
382}
383
/// Sends `transmit` through `sendmsg_x`, splitting it into datagrams in user space.
///
/// The payload is cut into chunks of `transmit.segment_size` bytes (at most `BATCH_SIZE`
/// of them) and each chunk is submitted as its own message. Without a segment size, or
/// with a segment size of zero, the whole payload goes out as one datagram. An empty
/// payload is submitted as a single zero-length datagram, like the `sendmsg`-based paths
/// do, instead of panicking on a zero chunk size.
///
/// The call is retried when interrupted; any other error is returned to the caller.
#[cfg(apple_fast)]
fn send(state: &UdpSocketState, io: SockRef<'_>, transmit: &Transmit<'_>) -> io::Result<()> {
    let mut hdrs = unsafe { mem::zeroed::<[msghdr_x; BATCH_SIZE]>() };
    let mut iovs = unsafe { mem::zeroed::<[libc::iovec; BATCH_SIZE]>() };
    let mut ctrls = [cmsg::Aligned([0u8; CMSG_LEN]); BATCH_SIZE];
    let addr = socket2::SockAddr::from(transmit.destination);

    // `chunks` and `div_ceil` both panic on a chunk size of zero, which is what an empty
    // payload without an explicit segment size would otherwise produce.
    let segment_size = transmit
        .segment_size
        .filter(|&size| size != 0)
        .unwrap_or(transmit.contents.len())
        .max(1);
    debug_assert!(transmit.contents.len().div_ceil(segment_size) <= BATCH_SIZE);

    // An empty slice yields no chunks at all, so the empty datagram is queued explicitly.
    let empty_datagram = transmit.contents.is_empty().then_some(transmit.contents);

    let mut cnt = 0;
    for (i, chunk) in empty_datagram
        .into_iter()
        .chain(transmit.contents.chunks(segment_size))
        .enumerate()
        .take(BATCH_SIZE)
    {
        prepare_msg(
            &Transmit {
                destination: transmit.destination,
                ecn: transmit.ecn,
                contents: chunk,
                segment_size: Some(chunk.len()),
                src_ip: transmit.src_ip,
            },
            &addr,
            &mut hdrs[i],
            &mut iovs[i],
            &mut ctrls[i],
            true,
            state.sendmsg_einval(),
        );
        hdrs[i].msg_datalen = chunk.len();
        cnt += 1;
    }
    loop {
        let n = unsafe { sendmsg_x(io.as_raw_fd(), hdrs.as_ptr(), cnt as u32, 0) };

        if n >= 0 {
            return Ok(());
        }

        let e = io::Error::last_os_error();
        match e.kind() {
            // Retry the transmission
            io::ErrorKind::Interrupted => continue,
            _ => return Err(e),
        }
    }
}
432
/// Sends `transmit` with a single `sendmsg` call.
///
/// The call is repeated for as long as it is interrupted; every other failure is handed
/// back to the caller unchanged.
#[cfg(any(target_os = "openbsd", target_os = "netbsd", apple_slow))]
fn send(state: &UdpSocketState, io: SockRef<'_>, transmit: &Transmit<'_>) -> io::Result<()> {
    let addr = socket2::SockAddr::from(transmit.destination);
    let mut ctrl = cmsg::Aligned([0u8; CMSG_LEN]);
    let mut iov: libc::iovec = unsafe { mem::zeroed() };
    let mut hdr: libc::msghdr = unsafe { mem::zeroed() };

    let encode_src_ip = cfg!(apple) || cfg!(target_os = "openbsd") || cfg!(target_os = "netbsd");
    prepare_msg(
        transmit,
        &addr,
        &mut hdr,
        &mut iov,
        &mut ctrl,
        encode_src_ip,
        state.sendmsg_einval(),
    );

    loop {
        let sent = unsafe { libc::sendmsg(io.as_raw_fd(), &hdr, 0) };
        if sent >= 0 {
            return Ok(());
        }

        let e = io::Error::last_os_error();
        // Only an interrupted call is worth retrying.
        if e.kind() != io::ErrorKind::Interrupted {
            return Err(e);
        }
    }
}
463
464#[cfg(not(any(apple, target_os = "openbsd", target_os = "netbsd", solarish)))]
465fn recv(io: SockRef<'_>, bufs: &mut [IoSliceMut<'_>], meta: &mut [RecvMeta]) -> io::Result<usize> {
466    let mut names = [MaybeUninit::<libc::sockaddr_storage>::uninit(); BATCH_SIZE];
467    let mut ctrls = [cmsg::Aligned(MaybeUninit::<[u8; CMSG_LEN]>::uninit()); BATCH_SIZE];
468    let mut hdrs = unsafe { mem::zeroed::<[libc::mmsghdr; BATCH_SIZE]>() };
469    let max_msg_count = bufs.len().min(BATCH_SIZE);
470    for i in 0..max_msg_count {
471        prepare_recv(
472            &mut bufs[i],
473            &mut names[i],
474            &mut ctrls[i],
475            &mut hdrs[i].msg_hdr,
476        );
477    }
478    let msg_count = loop {
479        let n = unsafe {
480            libc::recvmmsg(
481                io.as_raw_fd(),
482                hdrs.as_mut_ptr(),
483                bufs.len().min(BATCH_SIZE) as _,
484                0,
485                ptr::null_mut::<libc::timespec>(),
486            )
487        };
488
489        if n >= 0 {
490            break n;
491        }
492
493        let e = io::Error::last_os_error();
494        match e.kind() {
495            // Retry receiving
496            io::ErrorKind::Interrupted => continue,
497            _ => return Err(e),
498        }
499    };
500    for i in 0..(msg_count as usize) {
501        meta[i] = decode_recv(&names[i], &hdrs[i].msg_hdr, hdrs[i].msg_len as usize);
502    }
503    Ok(msg_count as usize)
504}
505
/// Receives up to `BATCH_SIZE` datagrams with a single `recvmsg_x` call.
///
/// Datagram `i` is stored in `bufs[i]` and described by `meta[i]`. The batch is limited to
/// the shorter of `bufs` and `meta`, so that every datagram taken off the socket has a
/// `meta` entry to describe it.
///
/// Returns the number of datagrams received. The call is retried when interrupted; any
/// other error is returned to the caller.
#[cfg(apple_fast)]
fn recv(io: SockRef<'_>, bufs: &mut [IoSliceMut<'_>], meta: &mut [RecvMeta]) -> io::Result<usize> {
    let mut names = [MaybeUninit::<libc::sockaddr_storage>::uninit(); BATCH_SIZE];
    // MacOS 10.15 `recvmsg_x` does not override the `msghdr_x`
    // `msg_controllen`. Thus, after the call to `recvmsg_x`, one does not know
    // which control messages have been written to. To prevent reading
    // uninitialized memory, do not use `MaybeUninit` for `ctrls`, instead
    // initialize `ctrls` with `0`s. A control message of all `0`s is
    // automatically skipped by `libc::CMSG_NXTHDR`.
    let mut ctrls = [cmsg::Aligned([0u8; CMSG_LEN]); BATCH_SIZE];
    let mut hdrs = unsafe { mem::zeroed::<[msghdr_x; BATCH_SIZE]>() };

    // Never request more datagrams than `meta` can describe: datagrams beyond that would
    // already have been removed from the socket by the time the shortfall is noticed.
    let max_msg_count = bufs.len().min(meta.len()).min(BATCH_SIZE);
    for i in 0..max_msg_count {
        prepare_recv(&mut bufs[i], &mut names[i], &mut ctrls[i], &mut hdrs[i]);
    }

    let msg_count = loop {
        let n = unsafe { recvmsg_x(io.as_raw_fd(), hdrs.as_mut_ptr(), max_msg_count as _, 0) };

        if n >= 0 {
            break n as usize;
        }

        let e = io::Error::last_os_error();
        match e.kind() {
            // Retry receiving
            io::ErrorKind::Interrupted => continue,
            _ => return Err(e),
        }
    };

    for ((slot, name), hdr) in meta.iter_mut().zip(&names).zip(&hdrs).take(msg_count) {
        *slot = decode_recv(name, hdr, hdr.msg_datalen as usize);
    }
    Ok(msg_count)
}
540
/// Receives a single datagram with `recvmsg` into `bufs[0]`, describing it in `meta[0]`.
///
/// Datagrams that did not fit into the buffer (`MSG_TRUNC`) are discarded and the next
/// one is read instead. The call is also retried when interrupted; any other error is
/// returned to the caller.
///
/// Returns `Ok(1)` once a datagram has been received, or `Ok(0)` without touching the
/// socket if `bufs` or `meta` is empty.
#[cfg(any(target_os = "openbsd", target_os = "netbsd", solarish, apple_slow))]
fn recv(io: SockRef<'_>, bufs: &mut [IoSliceMut<'_>], meta: &mut [RecvMeta]) -> io::Result<usize> {
    let (Some(buf), Some(slot)) = (bufs.first_mut(), meta.first_mut()) else {
        return Ok(0);
    };

    let mut name = MaybeUninit::<libc::sockaddr_storage>::uninit();
    let mut ctrl = cmsg::Aligned(MaybeUninit::<[u8; CMSG_LEN]>::uninit());
    let mut hdr = unsafe { mem::zeroed::<libc::msghdr>() };
    let n = loop {
        // Re-arm the header for every attempt: a successful `recvmsg` overwrites
        // `msg_namelen`, `msg_controllen` and `msg_flags`, so a retry must not reuse them.
        prepare_recv(buf, &mut name, &mut ctrl, &mut hdr);
        let n = unsafe { libc::recvmsg(io.as_raw_fd(), &mut hdr, 0) };

        // Errors have to be handled before looking at `msg_flags`, which is only
        // meaningful after a successful call.
        if n < 0 {
            let e = io::Error::last_os_error();
            match e.kind() {
                // Retry receiving
                io::ErrorKind::Interrupted => continue,
                _ => return Err(e),
            }
        }

        // Drop truncated datagrams and wait for the next one.
        if hdr.msg_flags & libc::MSG_TRUNC != 0 {
            continue;
        }

        break n;
    };
    *slot = decode_recv(&name, &hdr, n as usize);
    Ok(1)
}
568
// Size in bytes of every control message buffer used for sending and receiving.
// `UdpSocketState::new` asserts that this is large enough for a `c_int` control message
// plus, on most platforms, an `in6_pktinfo` control message.
const CMSG_LEN: usize = 88;
570
571fn prepare_msg(
572    transmit: &Transmit<'_>,
573    dst_addr: &socket2::SockAddr,
574    #[cfg(not(apple_fast))] hdr: &mut libc::msghdr,
575    #[cfg(apple_fast)] hdr: &mut msghdr_x,
576    iov: &mut libc::iovec,
577    ctrl: &mut cmsg::Aligned<[u8; CMSG_LEN]>,
578    #[allow(unused_variables)] // only used on FreeBSD & macOS
579    encode_src_ip: bool,
580    sendmsg_einval: bool,
581) {
582    iov.iov_base = transmit.contents.as_ptr() as *const _ as *mut _;
583    iov.iov_len = transmit.contents.len();
584
585    // SAFETY: Casting the pointer to a mutable one is legal,
586    // as sendmsg is guaranteed to not alter the mutable pointer
587    // as per the POSIX spec. See the section on the sys/socket.h
588    // header for details. The type is only mutable in the first
589    // place because it is reused by recvmsg as well.
590    let name = dst_addr.as_ptr() as *mut libc::c_void;
591    let namelen = dst_addr.len();
592    hdr.msg_name = name as *mut _;
593    hdr.msg_namelen = namelen;
594    hdr.msg_iov = iov;
595    hdr.msg_iovlen = 1;
596
597    hdr.msg_control = ctrl.0.as_mut_ptr() as _;
598    hdr.msg_controllen = CMSG_LEN as _;
599    let mut encoder = unsafe { cmsg::Encoder::new(hdr) };
600    let ecn = transmit.ecn.map_or(0, |x| x as libc::c_int);
601    // True for IPv4 or IPv4-Mapped IPv6
602    let is_ipv4 = transmit.destination.is_ipv4()
603        || matches!(transmit.destination.ip(), IpAddr::V6(addr) if addr.to_ipv4_mapped().is_some());
604    if is_ipv4 {
605        if !sendmsg_einval {
606            #[cfg(not(target_os = "netbsd"))]
607            {
608                encoder.push(libc::IPPROTO_IP, libc::IP_TOS, ecn as IpTosTy);
609            }
610        }
611    } else {
612        encoder.push(libc::IPPROTO_IPV6, libc::IPV6_TCLASS, ecn);
613    }
614
615    // Only set the segment size if it is less than the size of the contents.
616    // Some network drivers don't like being told to do GSO even if there is effectively only a single segment (i.e. `segment_size == transmit.contents.len()`)
617    // Additionally, a `segment_size` that is greater than the content also means there is effectively only a single segment.
618    // This case is actually quite common when splitting up a prepared GSO batch again after GSO has been disabled because the last datagram in a GSO batch is allowed to be smaller than the segment size.
619    if let Some(segment_size) = transmit
620        .segment_size
621        .filter(|segment_size| *segment_size < transmit.contents.len())
622    {
623        gso::set_segment_size(&mut encoder, segment_size as u16);
624    }
625
626    if let Some(ip) = &transmit.src_ip {
627        match ip {
628            IpAddr::V4(v4) => {
629                #[cfg(any(target_os = "linux", target_os = "android"))]
630                {
631                    let pktinfo = libc::in_pktinfo {
632                        ipi_ifindex: 0,
633                        ipi_spec_dst: libc::in_addr {
634                            s_addr: u32::from_ne_bytes(v4.octets()),
635                        },
636                        ipi_addr: libc::in_addr { s_addr: 0 },
637                    };
638                    encoder.push(libc::IPPROTO_IP, libc::IP_PKTINFO, pktinfo);
639                }
640                #[cfg(any(bsd, apple, solarish))]
641                {
642                    if encode_src_ip {
643                        let addr = libc::in_addr {
644                            s_addr: u32::from_ne_bytes(v4.octets()),
645                        };
646                        encoder.push(libc::IPPROTO_IP, libc::IP_RECVDSTADDR, addr);
647                    }
648                }
649            }
650            IpAddr::V6(v6) => {
651                let pktinfo = libc::in6_pktinfo {
652                    ipi6_ifindex: 0,
653                    ipi6_addr: libc::in6_addr {
654                        s6_addr: v6.octets(),
655                    },
656                };
657                encoder.push(libc::IPPROTO_IPV6, libc::IPV6_PKTINFO, pktinfo);
658            }
659        }
660    }
661
662    encoder.finish();
663}
664
665#[cfg(not(apple_fast))]
666fn prepare_recv(
667    buf: &mut IoSliceMut,
668    name: &mut MaybeUninit<libc::sockaddr_storage>,
669    ctrl: &mut cmsg::Aligned<MaybeUninit<[u8; CMSG_LEN]>>,
670    hdr: &mut libc::msghdr,
671) {
672    hdr.msg_name = name.as_mut_ptr() as _;
673    hdr.msg_namelen = mem::size_of::<libc::sockaddr_storage>() as _;
674    hdr.msg_iov = buf as *mut IoSliceMut as *mut libc::iovec;
675    hdr.msg_iovlen = 1;
676    hdr.msg_control = ctrl.0.as_mut_ptr() as _;
677    hdr.msg_controllen = CMSG_LEN as _;
678    hdr.msg_flags = 0;
679}
680
/// Points `hdr` at the buffers one received datagram is written to.
///
/// `buf` takes the payload, `name` the sender address and `ctrl` the control messages.
/// `hdr` stores raw pointers to all three, so they have to stay alive and in place until
/// the receive call has returned and `hdr` has been decoded.
#[cfg(apple_fast)]
fn prepare_recv(
    buf: &mut IoSliceMut,
    name: &mut MaybeUninit<libc::sockaddr_storage>,
    ctrl: &mut cmsg::Aligned<[u8; CMSG_LEN]>,
    hdr: &mut msghdr_x,
) {
    // Payload: the `IoSliceMut` itself serves as the one and only `iovec`, and the data
    // length starts out as the capacity of the buffer.
    hdr.msg_datalen = buf.len();
    hdr.msg_iov = (buf as *mut IoSliceMut).cast::<libc::iovec>();
    hdr.msg_iovlen = 1;

    // Sender address.
    hdr.msg_name = name.as_mut_ptr().cast::<libc::c_void>();
    hdr.msg_namelen = mem::size_of::<libc::sockaddr_storage>() as _;

    // Control messages.
    hdr.msg_control = ctrl.0.as_mut_ptr().cast::<libc::c_void>();
    hdr.msg_controllen = CMSG_LEN as _;

    hdr.msg_flags = 0;
}
697
698fn decode_recv(
699    name: &MaybeUninit<libc::sockaddr_storage>,
700    #[cfg(not(apple_fast))] hdr: &libc::msghdr,
701    #[cfg(apple_fast)] hdr: &msghdr_x,
702    len: usize,
703) -> RecvMeta {
704    let name = unsafe { name.assume_init() };
705    let mut ecn_bits = 0;
706    let mut dst_ip = None;
707    #[allow(unused_mut)] // only mutable on Linux
708    let mut stride = len;
709
710    let cmsg_iter = unsafe { cmsg::Iter::new(hdr) };
711    for cmsg in cmsg_iter {
712        match (cmsg.cmsg_level, cmsg.cmsg_type) {
713            (libc::IPPROTO_IP, libc::IP_TOS) => unsafe {
714                ecn_bits = cmsg::decode::<u8, libc::cmsghdr>(cmsg);
715            },
716            // FreeBSD uses IP_RECVTOS here, and we can be liberal because cmsgs are opt-in.
717            #[cfg(not(any(target_os = "openbsd", target_os = "netbsd", solarish)))]
718            (libc::IPPROTO_IP, libc::IP_RECVTOS) => unsafe {
719                ecn_bits = cmsg::decode::<u8, libc::cmsghdr>(cmsg);
720            },
721            (libc::IPPROTO_IPV6, libc::IPV6_TCLASS) => unsafe {
722                // Temporary hack around broken macos ABI. Remove once upstream fixes it.
723                // https://bugreport.apple.com/web/?problemID=48761855
724                #[allow(clippy::unnecessary_cast)] // cmsg.cmsg_len defined as size_t
725                if cfg!(apple)
726                    && cmsg.cmsg_len as usize == libc::CMSG_LEN(mem::size_of::<u8>() as _) as usize
727                {
728                    ecn_bits = cmsg::decode::<u8, libc::cmsghdr>(cmsg);
729                } else {
730                    ecn_bits = cmsg::decode::<libc::c_int, libc::cmsghdr>(cmsg) as u8;
731                }
732            },
733            #[cfg(any(target_os = "linux", target_os = "android"))]
734            (libc::IPPROTO_IP, libc::IP_PKTINFO) => {
735                let pktinfo = unsafe { cmsg::decode::<libc::in_pktinfo, libc::cmsghdr>(cmsg) };
736                dst_ip = Some(IpAddr::V4(Ipv4Addr::from(
737                    pktinfo.ipi_addr.s_addr.to_ne_bytes(),
738                )));
739            }
740            #[cfg(any(bsd, apple))]
741            (libc::IPPROTO_IP, libc::IP_RECVDSTADDR) => {
742                let in_addr = unsafe { cmsg::decode::<libc::in_addr, libc::cmsghdr>(cmsg) };
743                dst_ip = Some(IpAddr::V4(Ipv4Addr::from(in_addr.s_addr.to_ne_bytes())));
744            }
745            (libc::IPPROTO_IPV6, libc::IPV6_PKTINFO) => {
746                let pktinfo = unsafe { cmsg::decode::<libc::in6_pktinfo, libc::cmsghdr>(cmsg) };
747                dst_ip = Some(IpAddr::V6(Ipv6Addr::from(pktinfo.ipi6_addr.s6_addr)));
748            }
749            #[cfg(any(target_os = "linux", target_os = "android"))]
750            (libc::SOL_UDP, gro::UDP_GRO) => unsafe {
751                stride = cmsg::decode::<libc::c_int, libc::cmsghdr>(cmsg) as usize;
752            },
753            _ => {}
754        }
755    }
756
757    let addr = match libc::c_int::from(name.ss_family) {
758        libc::AF_INET => {
759            // Safety: if the ss_family field is AF_INET then storage must be a sockaddr_in.
760            let addr: &libc::sockaddr_in =
761                unsafe { &*(&name as *const _ as *const libc::sockaddr_in) };
762            SocketAddr::V4(SocketAddrV4::new(
763                Ipv4Addr::from(addr.sin_addr.s_addr.to_ne_bytes()),
764                u16::from_be(addr.sin_port),
765            ))
766        }
767        libc::AF_INET6 => {
768            // Safety: if the ss_family field is AF_INET6 then storage must be a sockaddr_in6.
769            let addr: &libc::sockaddr_in6 =
770                unsafe { &*(&name as *const _ as *const libc::sockaddr_in6) };
771            SocketAddr::V6(SocketAddrV6::new(
772                Ipv6Addr::from(addr.sin6_addr.s6_addr),
773                u16::from_be(addr.sin6_port),
774                addr.sin6_flowinfo,
775                addr.sin6_scope_id,
776            ))
777        }
778        _ => unreachable!(),
779    };
780
781    RecvMeta {
782        len,
783        stride,
784        addr,
785        ecn: EcnCodepoint::from_bits(ecn_bits),
786        dst_ip,
787    }
788}
789
// Number of message headers, address buffers and control buffers that the batched
// send and receive functions in this file allocate per call.
#[cfg(not(apple_slow))]
// Chosen somewhat arbitrarily; might benefit from additional tuning.
pub(crate) const BATCH_SIZE: usize = 32;

// With `apple_slow`, one message is handled per call, so the batch is a single entry.
#[cfg(apple_slow)]
pub(crate) const BATCH_SIZE: usize = 1;
796
797#[cfg(any(target_os = "linux", target_os = "android"))]
798mod gso {
799    use super::*;
800
801    #[cfg(not(target_os = "android"))]
802    const UDP_SEGMENT: libc::c_int = libc::UDP_SEGMENT;
803    #[cfg(target_os = "android")]
804    // TODO: Add this to libc
805    const UDP_SEGMENT: libc::c_int = 103;
806
807    /// Checks whether GSO support is available by setting the UDP_SEGMENT
808    /// option on a socket
809    pub(crate) fn max_gso_segments() -> usize {
810        const GSO_SIZE: libc::c_int = 1500;
811
812        let socket = match std::net::UdpSocket::bind("[::]:0")
813            .or_else(|_| std::net::UdpSocket::bind((Ipv4Addr::LOCALHOST, 0)))
814        {
815            Ok(socket) => socket,
816            Err(_) => return 1,
817        };
818
819        // As defined in linux/udp.h
820        // #define UDP_MAX_SEGMENTS        (1 << 6UL)
821        match set_socket_option(&socket, libc::SOL_UDP, UDP_SEGMENT, GSO_SIZE) {
822            Ok(()) => 64,
823            Err(_e) => {
824                crate::log::debug!(
825                    "failed to set `UDP_SEGMENT` socket option ({_e}); setting `max_gso_segments = 1`"
826                );
827
828                1
829            }
830        }
831    }
832
833    pub(crate) fn set_segment_size(encoder: &mut cmsg::Encoder<libc::msghdr>, segment_size: u16) {
834        encoder.push(libc::SOL_UDP, UDP_SEGMENT, segment_size);
835    }
836}
837
// On Apple platforms using the `sendmsg_x` call, UDP datagram segmentation is not
// offloaded to the NIC or even the kernel. Instead it is done here in user space (in
// [`send`]) and the segments are then passed to the OS as individual `iovec`s (up to
// `BATCH_SIZE`).
#[cfg(not(any(target_os = "linux", target_os = "android")))]
mod gso {
    use super::*;

    /// Returns the maximum number of segments per send: `BATCH_SIZE` when the
    /// `apple_fast` cfg is set, and 1 otherwise.
    pub(super) fn max_gso_segments() -> usize {
        if cfg!(apple_fast) {
            BATCH_SIZE
        } else {
            1
        }
    }

    /// No-op: no segment-size control message is encoded on these platforms.
    #[cfg(apple_fast)]
    pub(super) fn set_segment_size(_encoder: &mut cmsg::Encoder<msghdr_x>, _segment_size: u16) {}

    /// No-op: no segment-size control message is encoded on these platforms.
    #[cfg(not(apple_fast))]
    pub(super) fn set_segment_size(
        _encoder: &mut cmsg::Encoder<libc::msghdr>,
        _segment_size: u16,
    ) {
    }
}
863
864#[cfg(any(target_os = "linux", target_os = "android"))]
865mod gro {
866    use super::*;
867
868    #[cfg(not(target_os = "android"))]
869    pub(crate) const UDP_GRO: libc::c_int = libc::UDP_GRO;
870    #[cfg(target_os = "android")]
871    // TODO: Add this to libc
872    pub(crate) const UDP_GRO: libc::c_int = 104;
873
874    pub(crate) fn gro_segments() -> usize {
875        let socket = match std::net::UdpSocket::bind("[::]:0")
876            .or_else(|_| std::net::UdpSocket::bind((Ipv4Addr::LOCALHOST, 0)))
877        {
878            Ok(socket) => socket,
879            Err(_) => return 1,
880        };
881
882        // As defined in net/ipv4/udp_offload.c
883        // #define UDP_GRO_CNT_MAX 64
884        //
885        // NOTE: this MUST be set to UDP_GRO_CNT_MAX to ensure that the receive buffer size
886        // (get_max_udp_payload_size() * gro_segments()) is large enough to hold the largest GRO
887        // list the kernel might potentially produce. See
888        // https://github.com/quinn-rs/quinn/pull/1354.
889        match set_socket_option(&socket, libc::SOL_UDP, UDP_GRO, OPTION_ON) {
890            Ok(()) => 64,
891            Err(_) => 1,
892        }
893    }
894}
895
896/// Returns whether the given socket option is supported on the current platform
897///
898/// Yields `Ok(true)` if the option was set successfully, `Ok(false)` if setting
899/// the option raised an `ENOPROTOOPT` error, and `Err` for any other error.
900fn set_socket_option_supported(
901    socket: &impl AsRawFd,
902    level: libc::c_int,
903    name: libc::c_int,
904    value: libc::c_int,
905) -> io::Result<bool> {
906    match set_socket_option(socket, level, name, value) {
907        Ok(()) => Ok(true),
908        Err(err) if err.raw_os_error() == Some(libc::ENOPROTOOPT) => Ok(false),
909        Err(err) => Err(err),
910    }
911}
912
913fn set_socket_option(
914    socket: &impl AsRawFd,
915    level: libc::c_int,
916    name: libc::c_int,
917    value: libc::c_int,
918) -> io::Result<()> {
919    let rc = unsafe {
920        libc::setsockopt(
921            socket.as_raw_fd(),
922            level,
923            name,
924            &value as *const _ as _,
925            mem::size_of_val(&value) as _,
926        )
927    };
928
929    match rc == 0 {
930        true => Ok(()),
931        false => Err(io::Error::last_os_error()),
932    }
933}
934
935const OPTION_ON: libc::c_int = 1;
936
/// Stand-in for the GRO module on platforms other than Linux and Android.
#[cfg(not(any(target_os = "linux", target_os = "android")))]
mod gro {
    /// Segment count reported on these platforms.
    const SINGLE_SEGMENT: usize = 1;

    /// Returns the maximum number of segments per receive, which is always 1 here.
    pub(super) fn gro_segments() -> usize {
        SINGLE_SEGMENT
    }
}