syd 3.52.0

rock-solid application kernel
Documentation
// Syd: rock-solid application kernel
// src/kernel/net/recvfrom.rs: recvfrom(2) handler
//
// Copyright (c) 2025, 2026 Ali Polatel <alip@chesswob.org>
//
// SPDX-License-Identifier: GPL-3.0

use std::os::fd::AsFd;

use libseccomp::ScmpNotifResp;
use nix::{
    errno::Errno,
    sys::socket::{SockaddrLike, SockaddrStorage},
};
use zeroize::Zeroizing;

use crate::{
    compat::{recv, recvfrom, MsgFlags},
    config::MAX_RW_COUNT,
    fd::{fd_inode, get_nonblock, has_recv_timeout, SafeOwnedFd},
    kernel::net::to_msgflags,
    req::UNotifyEventRequest,
    unix::unix_addr_len,
};

/// Handle recv(2), and recvfrom(2) calls which do not request a source address.
///
/// Receives at most `args[2]` bytes (capped at `MAX_RW_COUNT`) from `fd`
/// using the message flags in `args[3]`, then copies the received bytes
/// into the sandbox process buffer at `args[1]`. On success the value
/// returned by the receive call is handed back to the sandbox process as
/// the system call return value.
///
/// # Errors
///
/// - `EOPNOTSUPP` if `restrict_oob` is set and `MSG_OOB` is requested.
/// - `EINVAL` if the length argument does not fit into `usize`.
/// - `ENOMEM` if the receive buffer cannot be allocated.
/// - Any error of the helpers used to query the fd, record/remove the
///   blocking call, receive the data, or write to sandbox process memory.
pub(crate) fn handle_recv(
    fd: SafeOwnedFd,
    args: &[u64; 6],
    request: &UNotifyEventRequest,
    restrict_oob: bool,
) -> Result<ScmpNotifResp, Errno> {
    // Truncate flags to 32-bit keeping unknown flags.
    let flags = to_msgflags(args[3]);

    // Reject MSG_OOB as necessary.
    if restrict_oob && flags.contains(MsgFlags::MSG_OOB) {
        // Signal no support to let the sandbox process handle the error
        // gracefully. This is consistent with the Linux kernel.
        return Err(Errno::EOPNOTSUPP);
    }

    // Length argument to the recv call must not be fully trusted, it
    // can be overly large, and allocating a Vector of that capacity may
    // overflow. It is valid for the length to be zero to receive an
    // empty message. Buffer read from kernel MUST be zeroized on drop.
    let len = usize::try_from(args[2])
        .or(Err(Errno::EINVAL))?
        .min(*MAX_RW_COUNT); // Cap count at MAX_RW_COUNT.
    let mut buf = Zeroizing::new(Vec::new());
    if len > 0 {
        buf.try_reserve(len).or(Err(Errno::ENOMEM))?;
        buf.resize(len, 0);
    }

    // Record blocking call so it can get invalidated.
    let req = request.scmpreq;
    let is_blocking = if !flags.contains(MsgFlags::MSG_DONTWAIT) && !get_nonblock(&fd)? {
        let ignore_restart = has_recv_timeout(&fd)?;

        // Record the blocking call.
        request.cache.add_sys_block(req, ignore_restart)?;

        true
    } else {
        false
    };

    // Perform recv(2).
    let result = recv(&fd, &mut buf, flags);

    // Remove invalidation record.
    if is_blocking {
        request.cache.del_sys_block(req.id)?;
    }

    // Check for recv errors after invalidation.
    let n = result?;

    // Write buffer into sandbox process memory.
    //
    // With MSG_TRUNC, recv(2) may report the real length of a datagram
    // even when it is longer than the buffer we passed. Never slice past
    // the end of our buffer, which would panic; copy what we actually
    // hold and still report "n" to the sandbox process.
    let copy_len = n.min(buf.len());
    request.write_mem_all(&buf[..copy_len], args[1])?;

    #[expect(clippy::cast_possible_wrap)]
    Ok(request.return_syscall(n as i64))
}

/// Handle recvfrom(2) on behalf of the sandbox process.
///
/// When the source address pointer (`args[4]`) is NULL the call is
/// forwarded to [`handle_recv`]. Otherwise the message is received
/// together with its source address, the payload is copied to the
/// sandbox process buffer at `args[1]`, the untruncated address length is
/// stored at `args[5]`, and the (possibly truncated) address is stored at
/// `args[4]`.
///
/// # Errors
///
/// - `EOPNOTSUPP` if `restrict_oob` is set and `MSG_OOB` is requested.
/// - `EFAULT` if the address length pointer is NULL or cannot be read in
///   full while the address pointer is non-NULL.
/// - `EINVAL` if the address length supplied by the caller is negative.
/// - Any error of the helpers used to query the fd, receive the data, or
///   access sandbox process memory.
pub(crate) fn handle_recvfrom(
    fd: SafeOwnedFd,
    args: &[u64; 6],
    request: &UNotifyEventRequest,
    restrict_oob: bool,
) -> Result<ScmpNotifResp, Errno> {
    // Caller is not interested in the source address, use recv handler.
    //
    // Linux only copies the address out when the address pointer is
    // non-NULL, and does not look at the address length pointer
    // otherwise. Dispatching here avoids consuming a message and then
    // failing on an address length the kernel would have ignored.
    if args[4] == 0 {
        return handle_recv(fd, args, request, restrict_oob);
    }

    // Truncate flags to 32-bit keeping unknown flags.
    let flags = to_msgflags(args[3]);

    // Reject MSG_OOB as necessary.
    if restrict_oob && flags.contains(MsgFlags::MSG_OOB) {
        // Signal no support to let the sandbox process handle the error
        // gracefully. This is consistent with the Linux kernel.
        return Err(Errno::EOPNOTSUPP);
    }

    // Check whether we should block and ignore restarts.
    let (is_blocking, ignore_restart) =
        if !flags.contains(MsgFlags::MSG_DONTWAIT) && !get_nonblock(&fd)? {
            let ignore_restart = has_recv_timeout(&fd)?;
            (true, ignore_restart)
        } else {
            (false, false)
        };

    // Get receiver inode before fd is consumed by do_recvfrom.
    let maybe_ino = fd_inode(&fd).ok();

    // Do the recvfrom call.
    let (buf, mut addr) = do_recvfrom(fd, request, flags, args[2], is_blocking, ignore_restart)?;

    // Determine address length.
    //
    // Linux validates address length after datagram is queued.
    if args[5] == 0 {
        // Linux returns EFAULT when addr is non-NULL but addr_len is NULL.
        return Err(Errno::EFAULT);
    }
    let addrlen = {
        const SIZEOF_SOCKLEN_T: usize = size_of::<libc::socklen_t>();
        let mut raw = [0u8; SIZEOF_SOCKLEN_T];
        if request.read_mem(&mut raw, args[5], SIZEOF_SOCKLEN_T)? != SIZEOF_SOCKLEN_T {
            // Linux returns EFAULT for invalid address length pointer.
            return Err(Errno::EFAULT);
        }
        // libc defines socklen_t as u32,
        // however we should check for negative values
        // and return EINVAL as necessary.
        libc::socklen_t::try_from(i32::from_ne_bytes(raw)).or(Err(Errno::EINVAL))?
    };

    // Change peer address as necessary for UNIX domain sockets.
    if let Some(ino) = maybe_ino {
        if let Some(peer_addr) = addr
            .as_ref()
            .and_then(|(addr, _)| request.resolve_unix_peer(addr, ino).ok())
        {
            let addr_len = peer_addr
                .as_unix_addr()
                .map_or(peer_addr.len(), unix_addr_len);
            addr = Some((peer_addr, addr_len));
        }
    }

    // Write buffer into sandbox process memory.
    let n = buf.len();
    request.write_mem_all(&buf, args[1])?;

    // Write the address length first, then the address.
    //
    // The length reported back is the untruncated address length, or
    // zero when no address was returned. Convert it into bytes.
    // This must be socklen_t and _not_ usize!
    let len: libc::socklen_t = addr.as_ref().map_or(0, |(_, addrlen_out)| *addrlen_out);
    request.write_mem_all(&len.to_ne_bytes(), args[5])?;

    // Write address into sandbox process memory as necessary.
    // The address may be None for connection-mode sockets.
    if let Some((addr, addrlen_out)) = addr {
        // Create a byte slice from the socket address.
        //
        // SAFETY: SockaddrStorage is initialized; as_ptr() and len() return valid bounds.
        let bytes =
            unsafe { std::slice::from_raw_parts(addr.as_ptr().cast::<u8>(), addr.len() as usize) };

        // Write the truncated socket address into memory.
        // Truncate late to avoid potential UB in std::slice::slice_from_raw_parts().
        //
        // Clamp to the length of the slice as well, so a reported address
        // length larger than the storage length cannot cause a panic.
        let out_len = (addrlen.min(addrlen_out) as usize).min(bytes.len());

        // Nothing to copy for a zero-length request; skip the write so
        // the address pointer is not touched, like Linux does.
        if out_len > 0 {
            request.write_mem_all(&bytes[..out_len], args[4])?;
        }
    }

    #[expect(clippy::cast_possible_wrap)]
    Ok(request.return_syscall(n as i64))
}

// Receive a message and its source address from `fd`.
//
// Allocates a zeroize-on-drop buffer of at most `len` bytes (capped at
// MAX_RW_COUNT), optionally records the call as blocking for the
// duration of the receive, and returns the buffer truncated to the
// received size together with the address reported by recvfrom().
#[expect(clippy::type_complexity)]
fn do_recvfrom<Fd: AsFd>(
    fd: Fd,
    request: &UNotifyEventRequest,
    flags: MsgFlags,
    len: u64,
    is_blocking: bool,
    ignore_restart: bool,
) -> Result<
    (
        Zeroizing<Vec<u8>>,
        Option<(SockaddrStorage, libc::socklen_t)>,
    ),
    Errno,
> {
    // The requested length comes from the sandbox process and is not
    // trusted: reject values that do not fit into usize and cap the
    // rest at MAX_RW_COUNT so the allocation stays bounded.
    let capacity = match usize::try_from(len) {
        Ok(size) => size.min(*MAX_RW_COUNT),
        Err(_) => return Err(Errno::EINVAL),
    };

    // A zero length is valid and receives an empty message.
    // Data read from the kernel MUST be zeroized on drop.
    let mut data: Zeroizing<Vec<u8>> = Zeroizing::new(Vec::new());
    if capacity != 0 {
        if data.try_reserve(capacity).is_err() {
            return Err(Errno::ENOMEM);
        }
        data.resize(capacity, 0);
    }

    // Register the blocking call so it can get invalidated.
    if is_blocking {
        request
            .cache
            .add_sys_block(request.scmpreq, ignore_restart)?;
    }

    // Perform recvfrom(2); defer error handling until the
    // invalidation record has been removed.
    let received = recvfrom(&fd, &mut data, flags);

    // Drop the invalidation record again.
    if is_blocking {
        request.cache.del_sys_block(request.scmpreq.id)?;
    }

    // Propagate receive errors, then shrink the buffer to the received size.
    received.map(|(n, addr)| {
        data.truncate(n);
        (data, addr)
    })
}