syd 3.52.0

rock-solid application kernel
Documentation
// Syd: rock-solid application kernel
// src/kernel/net/socket.rs: socket(2) and socketpair(2) handlers
//
// Copyright (c) 2025, 2026 Ali Polatel <alip@chesswob.org>
//
// SPDX-License-Identifier: GPL-3.0

// SAFETY: This module has been liberated from unsafe code!
#![forbid(unsafe_code)]

use std::os::fd::RawFd;

use libc::c_int;
use libseccomp::ScmpNotifResp;
use nix::{errno::Errno, sys::socket::SockFlag};

use crate::{
    cache::UnixVal,
    compat::{AddressFamily, SockType, AF_MAX, SOCK_TYPE_MASK},
    confine::is_valid_ptr,
    cookie::{safe_socket, safe_socketpair},
    ip::SocketCall,
    kernel::net::sandbox_addr_unnamed,
    req::UNotifyEventRequest,
    sandbox::{Flags, NetlinkFamily, Options, SandboxGuard},
};

/// Handles a `socket(2)` notification on behalf of the sandbox process.
///
/// Decodes domain, type and protocol from `args[0..=2]`, applies the
/// socket policy described by `options` and `netlink_families`, creates
/// the socket with `safe_socket` and hands the resulting descriptor to
/// the sandbox process through `request.send_fd`.
///
/// The socket is always created with `SOCK_CLOEXEC`; the close-on-exec
/// state passed to `send_fd` is set when the caller asked for
/// `SOCK_CLOEXEC` or when `flags.force_cloexec()` is set.
///
/// # Errors
///
/// - `EINVAL`: flag bits outside `SOCK_TYPE_MASK` are not known to
///   `SockFlag`, or `SockType::try_from` reports `EINVAL`.
/// - `EAFNOSUPPORT`: the domain is outside `0..AF_MAX`, is `Unspec`, is
///   `Alg` without `allow_unsafe_kcapi`, or is not one of the domains
///   accepted while `allow_unsupp_socket` is off.
/// - `EPROTONOSUPPORT`: with `allow_unsupp_socket` off, the netlink
///   protocol is out of range or not contained in `netlink_families`.
/// - `EACCES`: `Packet` domain without `allow_unsafe_socket`, or, with
///   `allow_unsupp_socket` on, a non-netlink socket whose type
///   `is_unsafe()` without `allow_unsafe_socket`.
/// - Any error returned by the block cache, `safe_socket` or `send_fd`.
pub(crate) fn handle_socket(
    request: &UNotifyEventRequest,
    args: &[u64; 6],
    flags: Flags,
    options: Options,
    netlink_families: NetlinkFamily,
) -> Result<ScmpNotifResp, Errno> {
    let unsafe_socket_ok = options.allow_unsafe_socket();
    let unsupp_socket_ok = options.allow_unsupp_socket();
    let unsafe_kcapi_ok = options.allow_unsafe_kcapi();
    let force_cloexec = flags.force_cloexec();
    let force_rand_fd = flags.force_rand_fd();

    // Only the low 32 bits of each argument are significant, as on Linux.
    #[expect(clippy::cast_possible_truncation)]
    let (raw_domain, raw_type, proto) = (args[0] as c_int, args[1] as c_int, args[2] as c_int);

    // Everything outside the type mask must be a known socket flag.
    let Some(sflag) = SockFlag::from_bits(raw_type & !SOCK_TYPE_MASK) else {
        return Err(Errno::EINVAL);
    };

    // Validation order mirrors Linux: address family first, socket type next.
    let domain = AddressFamily::from_raw(raw_domain);
    if !(0..AF_MAX).contains(&domain.as_raw()) {
        return Err(Errno::EAFNOSUPPORT);
    }
    let parsed_type = SockType::try_from(raw_type);
    if matches!(parsed_type, Err(Errno::EINVAL)) {
        return Err(Errno::EINVAL);
    }
    if domain == AddressFamily::Unspec {
        return Err(Errno::EAFNOSUPPORT);
    }
    let stype = parsed_type?;

    // For AF_UNIX, Linux treats SOCK_RAW as SOCK_DGRAM.
    let unix_raw = domain == AddressFamily::Unix && stype == SockType::Raw;
    let stype = if unix_raw {
        SockType::Datagram
    } else {
        stype
    };

    // Apply the sandbox policy to the requested domain and type.
    if unsupp_socket_ok {
        // trace/allow_unsupp_socket:1: any domain may pass, except that
        // AF_ALG still needs allow_unsafe_kcapi, and packet sockets and
        // unsafe socket types still need allow_unsafe_socket.
        if domain == AddressFamily::Alg && !unsafe_kcapi_ok {
            return Err(Errno::EAFNOSUPPORT);
        }
        if !unsafe_socket_ok
            && (domain == AddressFamily::Packet
                || (domain != AddressFamily::Netlink && stype.is_unsafe()))
        {
            return Err(Errno::EACCES);
        }
    } else {
        // Only a fixed set of domains is available.
        //
        // NOTE(review): this branch does not consult `stype.is_unsafe()`
        // for AF_UNIX/AF_INET/AF_INET6, unlike the branch above;
        // confirm that is intended.
        match domain {
            AddressFamily::Unix | AddressFamily::Inet | AddressFamily::Inet6 => {}
            AddressFamily::Alg if unsafe_kcapi_ok => {}
            AddressFamily::Packet => {
                if !unsafe_socket_ok {
                    return Err(Errno::EACCES);
                }
            }
            AddressFamily::Netlink => {
                // AF_NETLINK is limited to the allowlisted families; the
                // protocol argument selects the family. Denied or invalid
                // protocols yield EPROTONOSUPPORT, as on Linux.
                if !(0..=NetlinkFamily::max()).contains(&proto) {
                    return Err(Errno::EPROTONOSUPPORT);
                }
                let Some(family) = NetlinkFamily::from_bits(1 << proto) else {
                    return Err(Errno::EPROTONOSUPPORT);
                };
                if !netlink_families.contains(family) {
                    return Err(Errno::EPROTONOSUPPORT);
                }
            }
            _ => return Err(Errno::EAFNOSUPPORT),
        }
    }

    // Remember what the caller asked for, then always create the
    // socket with close-on-exec set.
    let want_cloexec = sflag.contains(SockFlag::SOCK_CLOEXEC) || force_cloexec;
    let create_flags = sflag | SockFlag::SOCK_CLOEXEC;

    // Register the call as blocking so that it can be invalidated.
    let scmpreq = request.scmpreq;
    request.cache.add_sys_block(scmpreq, false)?;

    // Perform the actual system call.
    let created = safe_socket(domain, stype, create_flags, proto);

    // Unregister the call before looking at the outcome.
    request.cache.del_sys_block(scmpreq.id)?;

    // Errors are only inspected once the critical section is over.
    let sock = created?;

    request.send_fd(sock, want_cloexec, force_rand_fd)
}

pub(crate) fn handle_socketpair(
    request: &UNotifyEventRequest,
    sandbox: SandboxGuard,
    args: &[u64; 6],
    call: SocketCall,
) -> Result<ScmpNotifResp, Errno> {
    let flags = *sandbox.flags;
    let options = *sandbox.options;
    let force_cloexec = flags.force_cloexec();
    let force_rand_fd = flags.force_rand_fd();
    let allow_unsupp_socket = options.allow_unsupp_socket();

    // Linux truncates upper bits and rejects unknown flags.
    #[expect(clippy::cast_possible_truncation)]
    let stype = args[1] as c_int;
    let sflag = SockFlag::from_bits(stype & !SOCK_TYPE_MASK).ok_or(Errno::EINVAL)?;

    #[expect(clippy::cast_possible_truncation)]
    let domain = AddressFamily::from_raw(args[0] as c_int);

    // Linux validates address family first, socket type next.
    if !(0..AF_MAX).contains(&domain.as_raw()) {
        return Err(Errno::EAFNOSUPPORT);
    }
    let stype = match SockType::try_from(stype) {
        Err(Errno::EINVAL) => return Err(Errno::EINVAL),
        _ if domain == AddressFamily::Unspec => return Err(Errno::EAFNOSUPPORT),
        Err(errno) => return Err(errno),
        Ok(stype) => stype,
    };

    #[expect(clippy::cast_possible_truncation)]
    let proto = args[2] as c_int;

    // On Linux, the only supported domains for this call are AF_UNIX (or
    // synonymously, AF_LOCAL) and AF_TIPC (since Linux 4.12).
    //
    // Linux accepts SOCK_RAW for AF_UNIX and converts it to SOCK_DGRAM.
    let stype = if domain == AddressFamily::Unix && stype == SockType::Raw {
        SockType::Datagram
    } else {
        stype
    };

    // AF_UNIX requires access check for bind access.
    let check_access = match domain {
        AddressFamily::Unix if !matches!(proto, 0 | libc::AF_UNIX) => {
            return Err(Errno::EPROTONOSUPPORT)
        }
        AddressFamily::Unix => true,
        AddressFamily::Tipc if !allow_unsupp_socket => return Err(Errno::EOPNOTSUPP),
        _ => false,
    };

    // Check AF_UNIX sockets for bind access to dummy `!unnamed' path.
    if check_access {
        sandbox_addr_unnamed(request, &sandbox, call)?;
    }
    drop(sandbox); // release the read-lock.

    // Check pointer against mmap_min_addr.
    let fdptr = args[3];
    if !is_valid_ptr(fdptr, request.scmpreq.data.arch) {
        return Err(Errno::EFAULT);
    }

    let cloexec = force_cloexec || sflag.contains(SockFlag::SOCK_CLOEXEC);
    let sflag = sflag | SockFlag::SOCK_CLOEXEC;

    // Record blocking call so it can get invalidated.
    let req = request.scmpreq;
    request.cache.add_sys_block(req, false)?;

    // All done, call underlying system call.
    let result = safe_socketpair(domain, stype, proto, sflag);

    // Remove invalidation record.
    request.cache.del_sys_block(req.id)?;

    // Check for errors after critical section.
    let (fd0, fd1) = result?;

    // Ensure memory is writable before installing fds.
    //
    // This is best effort, we can still leak fds if page protections
    // change after this call but before the next write memory call.
    let out = [0u8; 2 * size_of::<RawFd>()];
    request.write_mem_all(&out, fdptr)?;

    // Handle UNIX map after successful socketpair(2) for UNIX sockets.
    if domain == AddressFamily::Unix {
        // Record inode->PID mappings to the UNIX map.
        // We ignore errors because there's nothing we can do about them.
        let _ = request.add_unix(&fd0, request.scmpreq.pid(), UnixVal::default());
        let _ = request.add_unix(&fd1, request.scmpreq.pid(), UnixVal::default());
    }

    // Install both fds into the sandbox process.
    // Move fds into the function and close on return.
    let newfd0 = request.add_fd(fd0, cloexec, force_rand_fd)?;
    let newfd1 = request.add_fd(fd1, cloexec, force_rand_fd)?;

    // Write the installed fds back to sandbox process memory.
    let a = newfd0.to_ne_bytes();
    let b = newfd1.to_ne_bytes();
    let out = [a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]];

    // The caller provided `fdptr`:
    // Write back exactly 2 * sizeof(RawFd) bytes.
    request.write_mem_all(&out, fdptr)?;

    // socketpair(2) returns 0 on success.
    Ok(request.return_syscall(0))
}