syd 3.52.0

rock-solid application kernel
Documentation
//
// Syd: rock-solid application kernel
// src/kernel/memfd.rs: memfd_create(2) handler
//
// Copyright (c) 2023, 2024, 2025, 2026 Ali Polatel <alip@chesswob.org>
//
// SPDX-License-Identifier: GPL-3.0

// SAFETY: This module has been liberated from unsafe code!
#![forbid(unsafe_code)]

use std::ffi::CString;

use libseccomp::ScmpNotifResp;
use memchr::{arch::all::is_prefix, memchr};
use nix::errno::Errno;

use crate::{
    compat::{MFdFlags, SecretMemFlags},
    config::{HAVE_MFD_NOEXEC_SEAL, MFD_HUGETLB_NAME_PREFIX, MFD_NAME_PREFIX, MFD_SECRET_NAME},
    confine::is_valid_ptr,
    cookie::{safe_memfd_create, safe_memfd_secret},
    kernel::sandbox_path,
    path::{XPath, XPathBuf},
    req::UNotifyEventRequest,
    sandbox::Capability,
};

/// Handler for memfd_create(2).
///
/// Validates the flags and the name pointer, reads the name from the
/// sandbox process, checks sandbox access against the prefixed name,
/// and on success creates the memory fd and sends it to the sandbox
/// process.
///
/// Errors returned from the handler closure:
/// - `EINVAL`: invalid flags (see `to_mfdflags`), or no NUL byte within
///   the name limit.
/// - `EFAULT`: invalid name pointer, or a short read without a NUL byte.
/// - `EACCES`: the sandbox access check failed.
/// - Errors of `read_vec`, `safe_memfd_create` and `send_fd` are
///   propagated unchanged.
pub(crate) fn sys_memfd_create(request: UNotifyEventRequest) -> ScmpNotifResp {
    syscall_handler!(request, |request: UNotifyEventRequest| {
        const NAME_MAX: usize = 255;
        // One byte of the prefix is not counted against the limit.
        // NOTE(review): this was described as "the slash", but the
        // comments further down spell the prefix as `!memfd:`;
        // confirm against the definition of MFD_NAME_PREFIX.
        const MFD_NAME_PREFIX_LEN: usize = MFD_NAME_PREFIX.len() - 1;
        // Maximum name length, excluding the terminating NUL byte.
        const MFD_NAME_MAX_LEN: usize = NAME_MAX - MFD_NAME_PREFIX_LEN;

        // args[0] is the address of the name, args[1] is the flags.
        let req = request.scmpreq;
        let addr = req.data.args[0];
        let flags = req.data.args[1];

        // Validate flags argument first.
        let mut flags = to_mfdflags(flags)?;

        // Validate name argument next.
        if !is_valid_ptr(addr, req.data.arch) {
            // Return EFAULT for invalid pointer.
            return Err(Errno::EFAULT);
        }

        // Take the sandbox, which stays read-locked until the explicit
        // drop below, and snapshot the options needed after the lock
        // is released.
        let sandbox = request.get_sandbox();
        let force_cloexec = sandbox.flags.force_cloexec();
        let force_rand_fd = sandbox.flags.force_rand_fd();
        let restrict_memfd = !sandbox.flags.allow_unsafe_memfd();

        // Drop the executable flag and seal as nonexecutable,
        // unless trace/allow_unsafe_memfd:1 is set.
        // Flags are left untouched when MFD_NOEXEC_SEAL is unavailable.
        if restrict_memfd && *HAVE_MFD_NOEXEC_SEAL {
            flags.remove(MFdFlags::MFD_EXEC);
            flags.insert(MFdFlags::MFD_NOEXEC_SEAL);
        }

        // If buffer has no null byte, return EINVAL.
        // Return EFAULT for partial reads.
        const MFD_NAME_LEN: usize = MFD_NAME_MAX_LEN + 1 /* NUL byte */;
        let mut buf = request.read_vec(addr, MFD_NAME_LEN)?;
        // `null` is the length of the name including its NUL byte.
        let null = match memchr(0, &buf) {
            Some(null) => null.checked_add(1).ok_or(Errno::EINVAL)?,
            None if buf.len() < MFD_NAME_LEN => return Err(Errno::EFAULT),
            None => return Err(Errno::EINVAL),
        };
        // Cut the buffer right after the first NUL byte.
        buf.truncate(null);
        buf.shrink_to_fit();

        // If name starts with `syd', turn into `Syd'.
        // The `syd' prefix is used internally.
        // We don't return EINVAL here for stealth.
        if is_prefix(&buf, b"syd") {
            buf[0] = b'S';
        }
        // The buffer ends at its first NUL byte, so the conversion is
        // not expected to fail; EFAULT is returned if it does.
        let name = CString::from_vec_with_nul(buf).or(Err(Errno::EFAULT))?;

        // Determine sandbox capabilities.
        // Exec access is checked as well unless the memfd is sealed
        // as nonexecutable.
        let mut caps = Capability::CAP_CREATE;
        if !flags.contains(MFdFlags::MFD_NOEXEC_SEAL) {
            caps.insert(Capability::CAP_EXEC);
        }

        if sandbox.enabled_any(caps) {
            // Check for access by appending the memfd prefix.
            let mut path = XPathBuf::from(if flags.contains(MFdFlags::MFD_HUGETLB) {
                // !memfd-hugetlb:
                MFD_HUGETLB_NAME_PREFIX
            } else {
                // !memfd:
                MFD_NAME_PREFIX
            });
            // The name is appended without its terminating NUL byte.
            path.append_bytes(name.as_bytes());

            // Any error of the access check is reported as EACCES.
            sandbox_path(
                Some(&request),
                &sandbox,
                request.scmpreq.pid(), // Unused when request.is_some()
                &path,
                caps,
                "memfd_create",
            )
            .or(Err(Errno::EACCES))?;
        }
        drop(sandbox); // release the read-lock.

        // Set CLOEXEC for our fd always, and
        // Set CLOEXEC for remote fd as necessary.
        // `cloexec` must be computed before MFD_CLOEXEC is forced on.
        let cloexec = force_cloexec || flags.contains(MFdFlags::MFD_CLOEXEC);
        flags.insert(MFdFlags::MFD_CLOEXEC);

        // Access granted, emulate call.
        let fd = safe_memfd_create(name.as_c_str(), flags)?;

        // Return the fd to the sandbox process.
        request.send_fd(fd, cloexec, force_rand_fd)
    })
}

/// Handler for memfd_secret(2).
///
/// Validates the flags, checks create access for the secret memory
/// name when create sandboxing is enabled, and on success creates the
/// fd and sends it to the sandbox process.
///
/// Errors returned from the handler closure:
/// - `EINVAL`: invalid flags (see `to_smflags`).
/// - `EACCES`: the sandbox access check failed.
/// - Errors of `safe_memfd_secret` and `send_fd` are propagated
///   unchanged.
pub(crate) fn sys_memfd_secret(request: UNotifyEventRequest) -> ScmpNotifResp {
    syscall_handler!(request, |request: UNotifyEventRequest| {
        // The flags argument is validated before anything else.
        let mut sm_flags = to_smflags(request.scmpreq.data.args[0])?;

        // Take the sandbox and snapshot the options needed after the
        // read-lock is released.
        let sandbox = request.get_sandbox();
        let always_cloexec = sandbox.flags.force_cloexec();
        let rand_fd = sandbox.flags.force_rand_fd();

        // Only create access is relevant for secret memory.
        let cap = Capability::CAP_CREATE;
        if sandbox.enabled_any(cap) {
            // The access check runs against the !secretmem keyword,
            // any failure is reported as EACCES.
            let secret_name = XPath::from_bytes(MFD_SECRET_NAME);
            sandbox_path(
                Some(&request),
                &sandbox,
                request.scmpreq.pid(), // Unused when request.is_some()
                secret_name,
                cap,
                "memfd_secret",
            )
            .map_err(|_| Errno::EACCES)?;
        }
        drop(sandbox); // release the read-lock.

        // The remote fd is close-on-exec only if forced or requested,
        // our own fd is always close-on-exec.
        let remote_cloexec = always_cloexec || sm_flags.contains(SecretMemFlags::SM_CLOEXEC);
        sm_flags.insert(SecretMemFlags::SM_CLOEXEC);

        // Access granted, emulate call.
        let secret_fd = safe_memfd_secret(sm_flags)?;

        // Hand the fd over to the sandbox process.
        request.send_fd(secret_fd, remote_cloexec, rand_fd)
    })
}

// Validate the raw flags argument of memfd_create(2) and convert it
// to MFdFlags.
//
// Returns EINVAL for unknown bits, for a huge page size encoding given
// without MFD_HUGETLB, and for unsupported or conflicting use of
// MFD_EXEC and MFD_NOEXEC_SEAL.
#[inline]
fn to_mfdflags(arg: u64) -> Result<MFdFlags, Errno> {
    const EXEC_BITS: libc::c_uint = libc::MFD_EXEC | libc::MFD_NOEXEC_SEAL;
    const BASE_MASK: libc::c_uint =
        libc::MFD_CLOEXEC | libc::MFD_ALLOW_SEALING | libc::MFD_HUGETLB | EXEC_BITS;
    const HUGE_SIZE_MASK: libc::c_uint = libc::MFD_HUGE_MASK << libc::MFD_HUGE_SHIFT;

    // Linux kernel truncates upper bits.
    #[expect(clippy::cast_possible_truncation)]
    let raw = arg as libc::c_uint;

    // The huge page size encoding is only permitted together with
    // MFD_HUGETLB, everything outside the permitted mask is invalid.
    let permitted = if raw & libc::MFD_HUGETLB != 0 {
        BASE_MASK | HUGE_SIZE_MASK
    } else {
        BASE_MASK
    };
    if raw & !permitted != 0 {
        return Err(Errno::EINVAL);
    }

    // Linux>=6.3: Reject when both are specified together.
    // Linux<6.3: Reject both MFD_EXEC and MFD_NOEXEC_SEAL.
    let requested = raw & EXEC_BITS;
    let bad_exec = if *HAVE_MFD_NOEXEC_SEAL {
        requested == EXEC_BITS
    } else {
        requested != 0
    };
    if bad_exec {
        return Err(Errno::EINVAL);
    }

    Ok(MFdFlags::from_bits_retain(raw))
}

// Validate the raw flags argument of memfd_secret(2) and convert it
// to SecretMemFlags.
//
// Returns EINVAL when the truncated value is rejected by
// SecretMemFlags::from_bits.
#[inline]
fn to_smflags(arg: u64) -> Result<SecretMemFlags, Errno> {
    // Linux kernel truncates upper bits.
    #[expect(clippy::cast_possible_truncation)]
    let raw = arg as libc::c_uint;

    SecretMemFlags::from_bits(raw).ok_or(Errno::EINVAL)
}

#[cfg(test)]
mod tests {
    use super::*;

    // No flags at all is valid.
    #[test]
    fn test_to_mfdflags_0() {
        let got = to_mfdflags(0);
        assert_eq!(got, Ok(MFdFlags::empty()));
    }

    // MFD_CLOEXEC alone is accepted.
    #[test]
    fn test_to_mfdflags_1() {
        let arg = u64::from(libc::MFD_CLOEXEC);
        assert_eq!(to_mfdflags(arg), Ok(MFdFlags::MFD_CLOEXEC));
    }

    // MFD_ALLOW_SEALING alone is accepted.
    #[test]
    fn test_to_mfdflags_2() {
        let arg = u64::from(libc::MFD_ALLOW_SEALING);
        assert_eq!(to_mfdflags(arg), Ok(MFdFlags::MFD_ALLOW_SEALING));
    }

    // MFD_EXEC is accepted only when MFD_NOEXEC_SEAL is available.
    #[test]
    fn test_to_mfdflags_3() {
        let expected = if *HAVE_MFD_NOEXEC_SEAL {
            Ok(MFdFlags::MFD_EXEC)
        } else {
            Err(Errno::EINVAL)
        };
        assert_eq!(to_mfdflags(u64::from(libc::MFD_EXEC)), expected);
    }

    // MFD_NOEXEC_SEAL is accepted only when it is available.
    #[test]
    fn test_to_mfdflags_4() {
        let expected = if *HAVE_MFD_NOEXEC_SEAL {
            Ok(MFdFlags::MFD_NOEXEC_SEAL)
        } else {
            Err(Errno::EINVAL)
        };
        assert_eq!(to_mfdflags(u64::from(libc::MFD_NOEXEC_SEAL)), expected);
    }

    // MFD_EXEC together with MFD_NOEXEC_SEAL is rejected.
    // Only checked when MFD_NOEXEC_SEAL is available.
    #[test]
    fn test_to_mfdflags_5() {
        if *HAVE_MFD_NOEXEC_SEAL {
            let arg = u64::from(libc::MFD_EXEC | libc::MFD_NOEXEC_SEAL);
            assert_eq!(to_mfdflags(arg), Err(Errno::EINVAL));
        }
    }

    // A huge page size encoding is accepted together with MFD_HUGETLB.
    #[test]
    fn test_to_mfdflags_6() {
        let arg = u64::from(libc::MFD_HUGETLB | libc::MFD_HUGE_2MB);
        let expected = MFdFlags::MFD_HUGETLB | MFdFlags::MFD_HUGE_2MB;
        assert_eq!(to_mfdflags(arg), Ok(expected));
    }

    // Several valid flags may be combined.
    #[test]
    fn test_to_mfdflags_7() {
        let arg = u64::from(libc::MFD_CLOEXEC | libc::MFD_ALLOW_SEALING | libc::MFD_HUGETLB);
        let expected = MFdFlags::MFD_CLOEXEC | MFdFlags::MFD_ALLOW_SEALING | MFdFlags::MFD_HUGETLB;
        assert_eq!(to_mfdflags(arg), Ok(expected));
    }

    // Bits above the low 32 bits are ignored.
    #[test]
    fn test_to_mfdflags_8() {
        let arg = 1u64 << 33;
        assert_eq!(to_mfdflags(arg), Ok(MFdFlags::empty()));
    }

    // Upper bits are truncated before the remaining flags are validated.
    #[test]
    fn test_to_mfdflags_9() {
        let arg = 0xFFFF_FFFF_8800_0004u64;
        let expected = MFdFlags::MFD_HUGETLB | MFdFlags::MFD_HUGE_16GB;
        assert_eq!(to_mfdflags(arg), Ok(expected));
    }

    // An unknown bit in the low 32 bits is rejected.
    #[test]
    fn test_to_mfdflags_10() {
        let got = to_mfdflags(0x20);
        assert_eq!(got, Err(Errno::EINVAL));
    }

    // A huge page size encoding without MFD_HUGETLB is rejected.
    #[test]
    fn test_to_mfdflags_11() {
        let arg = u64::from(libc::MFD_HUGE_2MB);
        assert_eq!(to_mfdflags(arg), Err(Errno::EINVAL));
    }

    // No flags at all is valid.
    #[test]
    fn test_to_smflags_0() {
        let got = to_smflags(0);
        assert_eq!(got, Ok(SecretMemFlags::empty()));
    }

    // O_CLOEXEC maps to SM_CLOEXEC.
    #[test]
    fn test_to_smflags_1() {
        let arg = libc::O_CLOEXEC as u64;
        assert_eq!(to_smflags(arg), Ok(SecretMemFlags::SM_CLOEXEC));
    }

    // A single unknown bit is rejected.
    #[test]
    fn test_to_smflags_2() {
        let got = to_smflags(1);
        assert_eq!(got, Err(Errno::EINVAL));
    }

    // Several unknown bits are rejected.
    #[test]
    fn test_to_smflags_3() {
        let got = to_smflags(0xFF);
        assert_eq!(got, Err(Errno::EINVAL));
    }

    // Bits above the low 32 bits are ignored.
    #[test]
    fn test_to_smflags_4() {
        let arg = 1u64 << 33;
        assert_eq!(to_smflags(arg), Ok(SecretMemFlags::empty()));
    }
}