syd 3.52.0

rock-solid application kernel
Documentation
//
// Syd: rock-solid application kernel
// src/kernel/fanotify.rs: fanotify_mark(2) handler
//
// Copyright (c) 2023, 2024, 2025, 2026 Ali Polatel <alip@chesswob.org>
//
// SPDX-License-Identifier: GPL-3.0

// SAFETY: This module has been liberated from unsafe code!
#![forbid(unsafe_code)]

use std::os::fd::AsRawFd;

use libseccomp::ScmpNotifResp;
use nix::errno::Errno;

use crate::{
    compat::{Fanotify, FsType, MarkFlags, MaskFlags},
    confine::{scmp_arch_bits, scmp_arch_is_big_endian},
    fd::{to_fd, PROC_FILE},
    fs::readlinkat,
    kernel::syscall_path_handler,
    lookup::{file_type, FsFlags},
    path::XPathBuf,
    req::{SysArg, SysFlags, UNotifyEventRequest},
};

// Bitmask covering every fanotify_mark(2) command flag (ADD, REMOVE, FLUSH).
// Used to isolate the command part of the caller-supplied flags.
const MARK_CMD: MarkFlags = {
    let add = MarkFlags::FAN_MARK_ADD.bits();
    let remove = MarkFlags::FAN_MARK_REMOVE.bits();
    let flush = MarkFlags::FAN_MARK_FLUSH.bits();
    MarkFlags::from_bits_retain(add | remove | flush)
};

// Bitmask covering every fanotify_mark(2) mark-type flag.
// FAN_MARK_INODE == 0, so it contributes no bit here.
const MARK_TYPE: MarkFlags = {
    let filesystem = MarkFlags::FAN_MARK_FILESYSTEM.bits();
    let mntns = MarkFlags::FAN_MARK_MNTNS.bits();
    let mount = MarkFlags::FAN_MARK_MOUNT.bits();
    MarkFlags::from_bits_retain(filesystem | mntns | mount)
};
// The inode mark type is the absence of any mark-type bit (value 0).
const FAN_MARK_INODE: MarkFlags = MarkFlags::empty();

/// Handler for an intercepted fanotify_mark(2) system call.
///
/// The raw syscall arguments are validated up front, in the order
/// written below. Each failed check ends the call through
/// `request.fail_syscall`:
///
/// 1. Flags with bits unknown to `MarkFlags` (`EINVAL`).
/// 2. `FAN_MARK_IGNORE` together with `FAN_MARK_IGNORED_MASK` (`EINVAL`).
/// 3. A command that is not exactly one of ADD, REMOVE or FLUSH (`EINVAL`).
/// 4. A mask with a non-zero upper word on 32-bit architectures, or
///    with bits unknown to `MaskFlags` (`EINVAL`).
/// 5. An empty mask with ADD or REMOVE (`EINVAL`).
/// 6. FLUSH combined with anything but mark-type bits (`EINVAL`).
/// 7. A notify fd that cannot be converted or fetched (errno from
///    `to_fd` / `request.get_fd`), or that is not an fanotify fd (`EINVAL`).
/// 8. An invalid mark type (`EINVAL`).
///
/// After validation the dirfd/pathname pair is handed to
/// `syscall_path_handler`, and the closure applies the mark on the
/// resolved file through its proc(5) fd path.
///
/// The order of the early returns determines which errno the caller
/// observes, so keep it intact when editing.
#[expect(clippy::arithmetic_side_effects)]
pub(crate) fn sys_fanotify_mark(request: UNotifyEventRequest) -> ScmpNotifResp {
    let req = request.scmpreq;

    // Linux kernel truncates upper bits.
    #[expect(clippy::cast_possible_truncation)]
    let flags = req.data.args[1] as libc::c_uint;

    // Reject invalid flags.
    let flags = match MarkFlags::from_bits(flags) {
        Some(flags) => flags,
        None => return request.fail_syscall(Errno::EINVAL),
    };

    // Linux rejects the combination FAN_MARK_IGNORE|FAN_MARK_IGNORED_MASK.
    if flags.contains(MarkFlags::FAN_MARK_IGNORE | MarkFlags::FAN_MARK_IGNORED_MASK) {
        return request.fail_syscall(Errno::EINVAL);
    }

    // Mark command must be exactly one of ADD, REMOVE, or FLUSH.
    // Note: inside `matches!` the `|` separates pattern alternatives,
    // it is not a bitwise OR of the flags.
    let mark_cmd = flags & MARK_CMD;
    if !matches!(
        mark_cmd,
        MarkFlags::FAN_MARK_ADD | MarkFlags::FAN_MARK_REMOVE | MarkFlags::FAN_MARK_FLUSH
    ) {
        return request.fail_syscall(Errno::EINVAL);
    }

    // Reject undefined/invalid masks.
    //
    // Mask is a 64-bit value but kernel uses the lower 32-bits only.
    // On 32-bit architectures the 64-bit mask occupies two argument
    // slots: on big-endian the high word is in args[2] and the low word
    // in args[3], on little-endian it is the other way around.
    //
    // `narg` is the index of the last argument slot taken by the mask;
    // the dirfd and pathname arguments are located relative to it below.
    let is32 = scmp_arch_bits(req.data.arch) == 32;
    let (mask, narg) = if is32 {
        let (lo, hi) = if scmp_arch_is_big_endian(req.data.arch) {
            (req.data.args[3], req.data.args[2])
        } else {
            (req.data.args[2], req.data.args[3])
        };

        // Linux rejects upper 32-bits in mask.
        if hi != 0 {
            return request.fail_syscall(Errno::EINVAL);
        }

        (lo, 3)
    } else {
        (req.data.args[2], 2)
    };

    // Reject mask bits that `MaskFlags` does not define.
    let mask = match MaskFlags::from_bits(mask) {
        Some(mask) => mask,
        None => return request.fail_syscall(Errno::EINVAL),
    };

    // Commands ADD and REMOVE require a non-empty mask.
    if mask.is_empty()
        && matches!(
            mark_cmd,
            MarkFlags::FAN_MARK_ADD | MarkFlags::FAN_MARK_REMOVE
        )
    {
        return request.fail_syscall(Errno::EINVAL);
    }

    // Command FLUSH rejects extra flags beyond mark type and FLUSH.
    if mark_cmd == MarkFlags::FAN_MARK_FLUSH
        && !flags
            .difference(MARK_TYPE | MarkFlags::FAN_MARK_FLUSH)
            .is_empty()
    {
        return request.fail_syscall(Errno::EINVAL);
    }

    // Validate the FANotify FD.
    let notify_fd = match to_fd(req.data.args[0]) {
        Ok(fd) => fd,
        Err(errno) => return request.fail_syscall(errno),
    };

    // Get the FANotify FD.
    let notify_fd = match request.get_fd(notify_fd).map(Fanotify::from) {
        Ok(fd) => fd,
        Err(errno) => return request.fail_syscall(errno),
    };

    // Linux rejects non-fanotify fds with EINVAL before path lookup.
    //
    // The fd must live on an anonymous inode and its proc(5) link
    // target must read exactly "anon_inode:[fanotify]". A failure to
    // read the link is reported as EINVAL as well.
    match FsType::get(&notify_fd) {
        Ok(fst) if fst.is_anon_inode() => {
            let pfd = match XPathBuf::from_self_fd(notify_fd.as_raw_fd()) {
                Ok(pfd) => pfd,
                Err(errno) => return request.fail_syscall(errno),
            };
            match readlinkat(PROC_FILE(), &pfd) {
                Ok(target) if target.is_equal(b"anon_inode:[fanotify]") => {}
                _ => return request.fail_syscall(Errno::EINVAL),
            }
        }
        Ok(_) => return request.fail_syscall(Errno::EINVAL),
        Err(errno) => return request.fail_syscall(errno),
    }

    // Linux rejects invalid mark type with EINVAL.
    let mark_type = flags & MARK_TYPE;
    if !matches!(
        mark_type,
        FAN_MARK_INODE
            | MarkFlags::FAN_MARK_MOUNT
            | MarkFlags::FAN_MARK_FILESYSTEM
            | MarkFlags::FAN_MARK_MNTNS
    ) {
        return request.fail_syscall(Errno::EINVAL);
    }

    // fanotify(7) requires read access to the file or directory.
    // FAN_MARK_DONT_FOLLOW is honoured here, during path resolution,
    // by not following the last path component.
    let mut fsflags = FsFlags::MUST_PATH;
    if flags.contains(MarkFlags::FAN_MARK_DONT_FOLLOW) {
        fsflags |= FsFlags::NO_FOLLOW_LAST;
    }

    // Index of the pathname argument: the dirfd directly follows the
    // mask (narg + 1) and the pathname follows the dirfd (narg + 2).
    let pidx = narg + 2;

    // Linux accepts NULL pathname with AT_FDCWD.
    // Treat it like AT_EMPTY_PATH.
    //
    // NOTE(review): the path is resolved for every command, FLUSH
    // included. Confirm this matches the kernel, which may not look at
    // the pathname at all for FAN_MARK_FLUSH.
    let argv = &[SysArg {
        dirfd: Some(narg + 1),
        path: Some(pidx),
        flags: SysFlags::EMPTY_PATH | SysFlags::MAYBE_NULL,
        fsflags,
    }];

    syscall_path_handler(
        request,
        "fanotify_mark",
        argv,
        |path_args, request, sandbox| {
            // Copy the two settings we need, then let go of the sandbox.
            let restrict_notify_bdev = !sandbox.flags.allow_unsafe_notify_bdev();
            let restrict_notify_cdev = !sandbox.flags.allow_unsafe_notify_cdev();
            drop(sandbox); // release the read-lock.

            // SysArg has one element.
            #[expect(clippy::disallowed_methods)]
            let fd = path_args.0.as_ref().unwrap().path.dir();

            // Strip FAN_{ACCESS,ACCESS_PERM,MODIFY} if we're marking a sidechannel device.
            let mut mask = mask;
            if restrict_notify_bdev || restrict_notify_cdev {
                // The file type is only looked up when at least one of
                // the two restrictions is active. The events are removed
                // for block devices and/or character devices, depending
                // on which restriction applies.
                let filetype = file_type(fd, None, false)?;
                if (restrict_notify_bdev && filetype.is_block_device())
                    || (restrict_notify_cdev && filetype.is_char_device())
                {
                    mask.remove(MaskFlags::FAN_ACCESS);
                    mask.remove(MaskFlags::FAN_ACCESS_PERM);
                    mask.remove(MaskFlags::FAN_MODIFY);
                }
            }
            // Strip FAN_MARK_DONT_FOLLOW which has already been handled
            // during canonicalization.
            let mut flags = flags;
            flags.remove(MarkFlags::FAN_MARK_DONT_FOLLOW);

            // The path has already been opened as a FD. Pass the
            // proc(5) path of that FD (built by `XPathBuf::from_self_fd`,
            // relative to `PROC_FILE()`) as the pathname argument of
            // fanotify_mark(2) to avoid symlink TOCTOU.
            let pfd = XPathBuf::from_self_fd(fd.as_raw_fd())?;

            // Record blocking call so it can get invalidated.
            request.cache.add_sys_block(req, false)?;

            // Call fanotify_mark(2) through type-safe interface.
            // The result is held back so that the invalidation record
            // is removed on both success and failure.
            let result = notify_fd.mark(flags, mask, PROC_FILE(), Some(&pfd));

            // Remove invalidation record.
            request.cache.del_sys_block(req.id)?;

            // On success the sandboxed caller sees a return value of 0.
            result.map(|_| request.return_syscall(0))
        },
    )
}