Skip to main content

libcontainer/syscall/
linux.rs

1//! Implements Command trait for Linux systems
2use std::any::Any;
3use std::ffi::{CStr, CString, OsStr};
4use std::os::fd::{BorrowedFd, FromRawFd, RawFd};
5use std::os::unix::ffi::OsStrExt;
6use std::os::unix::fs::symlink;
7use std::os::unix::io::{AsRawFd, OwnedFd};
8use std::path::Path;
9use std::str::FromStr;
10use std::sync::Arc;
11use std::{mem, ptr};
12
13use caps::{CapSet, CapsHashSet};
14use libc::{c_char, setdomainname, uid_t};
15use nix::dir::Dir;
16use nix::fcntl;
17use nix::fcntl::{OFlag, open};
18use nix::mount::{MntFlags, MsFlags, mount, umount2};
19use nix::sched::{CloneFlags, unshare};
20use nix::sys::stat::{Mode, SFlag, mknod};
21use nix::unistd::{Gid, Uid, chown, chroot, close, fchdir, pivot_root, sethostname};
22use oci_spec::runtime::PosixRlimit;
23use pathrs::flags::OpenFlags;
24use pathrs::procfs::{ProcfsBase, ProcfsHandle};
25
26use super::{Result, Syscall, SyscallError};
27use crate::capabilities;
28use crate::config::PersonalityDomain;
29
// Flag values for the mount_setattr(2) / move_mount(2) family of calls.
// See https://man7.org/linux/man-pages/man2/mount_setattr.2.html.

/// Change the mount properties of the entire mount tree.
pub const AT_RECURSIVE: u32 = 0x8000;
/// `AT_EMPTY_PATH` flag value.
pub const AT_EMPTY_PATH: u32 = 0x1000;
/// Setting on how atime should be updated (covers the atime-related bits).
// The double underscore mirrors the kernel's spelling, hence the lint opt-out.
#[allow(non_upper_case_globals)]
pub const MOUNT_ATTR__ATIME: u64 = 0x70;
/// Mount read-only.
pub const MOUNT_ATTR_RDONLY: u64 = 0x01;
/// Ignore suid and sgid bits.
pub const MOUNT_ATTR_NOSUID: u64 = 0x02;
/// Disallow access to device special files.
pub const MOUNT_ATTR_NODEV: u64 = 0x04;
/// Disallow program execution.
pub const MOUNT_ATTR_NOEXEC: u64 = 0x08;
/// Update atime relative to mtime/ctime (note: the value is zero).
pub const MOUNT_ATTR_RELATIME: u64 = 0x00;
/// Do not update access times.
pub const MOUNT_ATTR_NOATIME: u64 = 0x10;
/// Always perform atime updates.
pub const MOUNT_ATTR_STRICTATIME: u64 = 0x20;
/// Do not update directory access times.
pub const MOUNT_ATTR_NODIRATIME: u64 = 0x80;
/// Prevents following symbolic links.
pub const MOUNT_ATTR_NOSYMFOLLOW: u64 = 0x0020_0000;
/// `move_mount` flag: the "from" path is empty.
pub const MOVE_MOUNT_F_EMPTY_PATH: u32 = 0x04;
/// `move_mount` flag: the "to" path is empty.
pub const MOVE_MOUNT_T_EMPTY_PATH: u32 = 0x40;

// The type of fsconfig() call made.
pub const FSCONFIG_SET_FLAG: u64 = 0;
pub const FSCONFIG_SET_STRING: u64 = 1;
pub const FSCONFIG_SET_BINARY: u64 = 2;
pub const FSCONFIG_SET_PATH: u64 = 3;
pub const FSCONFIG_SET_PATH_EMPTY: u64 = 4;
pub const FSCONFIG_SET_FD: u64 = 5;
pub const FSCONFIG_CMD_CREATE: u64 = 6;
pub const FSCONFIG_CMD_RECONFIGURE: u64 = 7;
pub const FSCONFIG_CMD_CREATE_EXCL: u64 = 8;
58
59/// Constants used by mount(2).
60pub enum MountOption {
61    Defaults(bool, MsFlags),
62    Ro(bool, MsFlags),
63    Rw(bool, MsFlags),
64    Suid(bool, MsFlags),
65    Nosuid(bool, MsFlags),
66    Dev(bool, MsFlags),
67    Nodev(bool, MsFlags),
68    Exec(bool, MsFlags),
69    Noexec(bool, MsFlags),
70    Sync(bool, MsFlags),
71    Async(bool, MsFlags),
72    Dirsync(bool, MsFlags),
73    Remount(bool, MsFlags),
74    Mand(bool, MsFlags),
75    Nomand(bool, MsFlags),
76    Atime(bool, MsFlags),
77    Noatime(bool, MsFlags),
78    Diratime(bool, MsFlags),
79    Nodiratime(bool, MsFlags),
80    Bind(bool, MsFlags),
81    Rbind(bool, MsFlags),
82    Unbindable(bool, MsFlags),
83    Runbindable(bool, MsFlags),
84    Private(bool, MsFlags),
85    Rprivate(bool, MsFlags),
86    Shared(bool, MsFlags),
87    Rshared(bool, MsFlags),
88    Slave(bool, MsFlags),
89    Rslave(bool, MsFlags),
90    Relatime(bool, MsFlags),
91    Norelatime(bool, MsFlags),
92    Strictatime(bool, MsFlags),
93    Nostrictatime(bool, MsFlags),
94}
95
96impl MountOption {
97    // Return all possible mount options
98    pub fn known_options() -> Vec<String> {
99        [
100            "defaults",
101            "ro",
102            "rw",
103            "suid",
104            "nosuid",
105            "dev",
106            "nodev",
107            "exec",
108            "noexec",
109            "sync",
110            "async",
111            "dirsync",
112            "remount",
113            "mand",
114            "nomand",
115            "atime",
116            "noatime",
117            "diratime",
118            "nodiratime",
119            "bind",
120            "rbind",
121            "unbindable",
122            "runbindable",
123            "private",
124            "rprivate",
125            "shared",
126            "rshared",
127            "slave",
128            "rslave",
129            "relatime",
130            "norelatime",
131            "strictatime",
132            "nostrictatime",
133        ]
134        .iter()
135        .map(|s| s.to_string())
136        .collect()
137    }
138}
139
140impl FromStr for MountOption {
141    type Err = String;
142
143    fn from_str(option: &str) -> std::result::Result<Self, Self::Err> {
144        match option {
145            "defaults" => Ok(MountOption::Defaults(false, MsFlags::empty())),
146            "ro" => Ok(MountOption::Ro(false, MsFlags::MS_RDONLY)),
147            "rw" => Ok(MountOption::Rw(true, MsFlags::MS_RDONLY)),
148            "suid" => Ok(MountOption::Suid(true, MsFlags::MS_NOSUID)),
149            "nosuid" => Ok(MountOption::Nosuid(false, MsFlags::MS_NOSUID)),
150            "dev" => Ok(MountOption::Dev(true, MsFlags::MS_NODEV)),
151            "nodev" => Ok(MountOption::Nodev(false, MsFlags::MS_NODEV)),
152            "exec" => Ok(MountOption::Exec(true, MsFlags::MS_NOEXEC)),
153            "noexec" => Ok(MountOption::Noexec(false, MsFlags::MS_NOEXEC)),
154            "sync" => Ok(MountOption::Sync(false, MsFlags::MS_SYNCHRONOUS)),
155            "async" => Ok(MountOption::Async(true, MsFlags::MS_SYNCHRONOUS)),
156            "dirsync" => Ok(MountOption::Dirsync(false, MsFlags::MS_DIRSYNC)),
157            "remount" => Ok(MountOption::Remount(false, MsFlags::MS_REMOUNT)),
158            "mand" => Ok(MountOption::Mand(false, MsFlags::MS_MANDLOCK)),
159            "nomand" => Ok(MountOption::Nomand(true, MsFlags::MS_MANDLOCK)),
160            "atime" => Ok(MountOption::Atime(true, MsFlags::MS_NOATIME)),
161            "noatime" => Ok(MountOption::Noatime(false, MsFlags::MS_NOATIME)),
162            "diratime" => Ok(MountOption::Diratime(true, MsFlags::MS_NODIRATIME)),
163            "nodiratime" => Ok(MountOption::Nodiratime(false, MsFlags::MS_NODIRATIME)),
164            "bind" => Ok(MountOption::Bind(false, MsFlags::MS_BIND)),
165            "rbind" => Ok(MountOption::Rbind(
166                false,
167                MsFlags::MS_BIND | MsFlags::MS_REC,
168            )),
169            "unbindable" => Ok(MountOption::Unbindable(false, MsFlags::MS_UNBINDABLE)),
170            "runbindable" => Ok(MountOption::Runbindable(
171                false,
172                MsFlags::MS_UNBINDABLE | MsFlags::MS_REC,
173            )),
174            "private" => Ok(MountOption::Private(true, MsFlags::MS_PRIVATE)),
175            "rprivate" => Ok(MountOption::Rprivate(
176                true,
177                MsFlags::MS_PRIVATE | MsFlags::MS_REC,
178            )),
179            "shared" => Ok(MountOption::Shared(true, MsFlags::MS_SHARED)),
180            "rshared" => Ok(MountOption::Rshared(
181                true,
182                MsFlags::MS_SHARED | MsFlags::MS_REC,
183            )),
184            "slave" => Ok(MountOption::Slave(true, MsFlags::MS_SLAVE)),
185            "rslave" => Ok(MountOption::Rslave(
186                true,
187                MsFlags::MS_SLAVE | MsFlags::MS_REC,
188            )),
189            "relatime" => Ok(MountOption::Relatime(false, MsFlags::MS_RELATIME)),
190            "norelatime" => Ok(MountOption::Norelatime(true, MsFlags::MS_RELATIME)),
191            "strictatime" => Ok(MountOption::Strictatime(false, MsFlags::MS_STRICTATIME)),
192            "nostrictatime" => Ok(MountOption::Nostrictatime(true, MsFlags::MS_STRICTATIME)),
193            _ => Err(option.to_string()),
194        }
195    }
196}
197
/// Recursive mount attributes, as used with mount_setattr(2).
///
/// Every variant carries a `bool` together with the `MOUNT_ATTR_*` bit(s) it
/// refers to.
// NOTE(review): the bool appears to select "clear" (true) versus "set"
// (false), judging by the `rro`/`rrw` pair in the parser — confirm with the
// code that builds the `MountAttr`.
pub enum MountRecursive {
    /// Mount read-only.
    Rdonly(bool, u64),
    /// Ignore suid and sgid bits.
    Nosuid(bool, u64),
    /// Disallow access to device special files.
    Nodev(bool, u64),
    /// Disallow program execution.
    Noexec(bool, u64),
    /// Setting on how atime should be updated.
    Atime(bool, u64),
    /// Update atime relative to mtime/ctime.
    Relatime(bool, u64),
    /// Do not update access times.
    Noatime(bool, u64),
    /// Always perform atime updates.
    StrictAtime(bool, u64),
    /// Do not update directory access times.
    NoDiratime(bool, u64),
    /// Prevents following symbolic links.
    Nosymfollow(bool, u64),
}
230
231impl FromStr for MountRecursive {
232    type Err = SyscallError;
233
234    fn from_str(option: &str) -> std::result::Result<Self, Self::Err> {
235        match option {
236            "rro" => Ok(MountRecursive::Rdonly(false, MOUNT_ATTR_RDONLY)),
237            "rrw" => Ok(MountRecursive::Rdonly(true, MOUNT_ATTR_RDONLY)),
238            "rnosuid" => Ok(MountRecursive::Nosuid(false, MOUNT_ATTR_NOSUID)),
239            "rsuid" => Ok(MountRecursive::Nosuid(true, MOUNT_ATTR_NOSUID)),
240            "rnodev" => Ok(MountRecursive::Nodev(false, MOUNT_ATTR_NODEV)),
241            "rdev" => Ok(MountRecursive::Nodev(true, MOUNT_ATTR_NODEV)),
242            "rnoexec" => Ok(MountRecursive::Noexec(false, MOUNT_ATTR_NOEXEC)),
243            "rexec" => Ok(MountRecursive::Noexec(true, MOUNT_ATTR_NOEXEC)),
244            "rnodiratime" => Ok(MountRecursive::NoDiratime(false, MOUNT_ATTR_NODIRATIME)),
245            "rdiratime" => Ok(MountRecursive::NoDiratime(true, MOUNT_ATTR_NODIRATIME)),
246            "rrelatime" => Ok(MountRecursive::Relatime(false, MOUNT_ATTR_RELATIME)),
247            "rnorelatime" => Ok(MountRecursive::Relatime(true, MOUNT_ATTR_RELATIME)),
248            "rnoatime" => Ok(MountRecursive::Noatime(false, MOUNT_ATTR_NOATIME)),
249            "ratime" => Ok(MountRecursive::Noatime(true, MOUNT_ATTR_NOATIME)),
250            "rstrictatime" => Ok(MountRecursive::StrictAtime(false, MOUNT_ATTR_STRICTATIME)),
251            "rnostrictatime" => Ok(MountRecursive::StrictAtime(true, MOUNT_ATTR_STRICTATIME)),
252            "rnosymfollow" => Ok(MountRecursive::Nosymfollow(false, MOUNT_ATTR_NOSYMFOLLOW)),
253            "rsymfollow" => Ok(MountRecursive::Nosymfollow(true, MOUNT_ATTR_NOSYMFOLLOW)),
254            // No support for MOUNT_ATTR_IDMAP yet (needs UserNS FD)
255            _ => Err(SyscallError::UnexpectedMountRecursiveOption(
256                option.to_string(),
257            )),
258        }
259    }
260}
261
/// A structure used as the third argument of mount_setattr(2).
///
/// `#[repr(C)]` keeps the four 64-bit fields in declaration order so the
/// value can be handed to the kernel as-is.
#[repr(C)]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MountAttr {
    /// Mount properties to set.
    pub attr_set: u64,
    /// Mount properties to clear.
    pub attr_clr: u64,
    /// Mount propagation type.
    pub propagation: u64,
    /// User namespace file descriptor.
    pub userns_fd: u64,
}
278
279impl MountAttr {
280    /// Return MountAttr with the flag raised.
281    /// This function is used in test code.
282    pub fn all() -> Self {
283        MountAttr {
284            attr_set: MOUNT_ATTR_RDONLY
285                | MOUNT_ATTR_NOSUID
286                | MOUNT_ATTR_NODEV
287                | MOUNT_ATTR_NOEXEC
288                | MOUNT_ATTR_NODIRATIME
289                | MOUNT_ATTR_RELATIME
290                | MOUNT_ATTR_NOATIME
291                | MOUNT_ATTR_STRICTATIME
292                | MOUNT_ATTR_NOSYMFOLLOW,
293            attr_clr: MOUNT_ATTR_RDONLY
294                | MOUNT_ATTR_NOSUID
295                | MOUNT_ATTR_NODEV
296                | MOUNT_ATTR_NOEXEC
297                | MOUNT_ATTR_NODIRATIME
298                | MOUNT_ATTR_RELATIME
299                | MOUNT_ATTR_NOATIME
300                | MOUNT_ATTR_STRICTATIME
301                | MOUNT_ATTR_NOSYMFOLLOW
302                | MOUNT_ATTR__ATIME,
303            propagation: 0,
304            userns_fd: 0,
305        }
306    }
307}
308
/// Field-less marker type; its `Syscall` implementation performs the real
/// Linux system calls.
#[derive(Clone)]
pub struct LinuxSyscall;
312
313impl LinuxSyscall {
314    unsafe fn from_raw_buf<'a, T>(p: *const c_char) -> T
315    where
316        T: From<&'a OsStr>,
317    {
318        unsafe { T::from(OsStr::from_bytes(CStr::from_ptr(p).to_bytes())) }
319    }
320
321    /// Reads data from the `c_passwd` and returns it as a `User`.
322    unsafe fn passwd_to_user(passwd: libc::passwd) -> Arc<OsStr> {
323        let name: Arc<OsStr> = unsafe { Self::from_raw_buf(passwd.pw_name) };
324        name
325    }
326
327    fn emulate_close_range(preserve_fds: i32) -> Result<()> {
328        let open_fds = Self::get_open_fds()?;
329        // Include stdin, stdout, and stderr for fd 0, 1, and 2 respectively.
330        let min_fd = preserve_fds + 3;
331        let to_be_cleaned_up_fds: Vec<i32> = open_fds
332            .iter()
333            .filter_map(|&fd| if fd >= min_fd { Some(fd) } else { None })
334            .collect();
335
336        to_be_cleaned_up_fds.iter().for_each(|&fd| {
337            // Intentionally ignore errors here -- the cases where this might fail
338            // are basically file descriptors that have already been closed.
339            let _ = fcntl::fcntl(fd, fcntl::F_SETFD(fcntl::FdFlag::FD_CLOEXEC));
340        });
341
342        Ok(())
343    }
344
345    // Get a list of open fds for the calling process.
346    fn get_open_fds() -> Result<Vec<i32>> {
347        let dir = ProcfsHandle::new()?.open(
348            ProcfsBase::ProcSelf,
349            Path::new("fd"),
350            OpenFlags::O_DIRECTORY | OpenFlags::O_CLOEXEC,
351        )?;
352
353        let fds = Dir::from(dir)?
354            .into_iter()
355            .filter_map(|entry| entry.ok())
356            .filter_map(|entry| {
357                // Convert the file name from string into i32. Since we are looking
358                // at /proc/<pid>/fd, anything that's not a number (i32) can be
359                // ignored. We are only interested in opened fds.
360                entry
361                    .file_name()
362                    .to_str()
363                    .ok()
364                    .and_then(|name| name.parse::<i32>().ok())
365            })
366            .collect();
367
368        Ok(fds)
369    }
370}
371
372impl Syscall for LinuxSyscall {
373    /// To enable dynamic typing,
374    /// see <https://doc.rust-lang.org/std/any/index.html> for more information
375    fn as_any(&self) -> &dyn Any {
376        self
377    }
378
379    /// Function to set given path as root path inside process
380    fn pivot_rootfs(&self, path: &Path) -> Result<()> {
381        // open the path as directory and read only
382        let newroot = open(
383            path,
384            OFlag::O_DIRECTORY | OFlag::O_RDONLY | OFlag::O_CLOEXEC,
385            Mode::empty(),
386        )
387        .inspect_err(|errno| {
388            tracing::error!(?errno, ?path, "failed to open the new root for pivot root");
389        })?;
390
391        // make the given path as the root directory for the container
392        // see https://man7.org/linux/man-pages/man2/pivot_root.2.html, specially the notes
393        // pivot root usually changes the root directory to first argument, and then mounts the original root
394        // directory at second argument. Giving same path for both stacks mapping of the original root directory
395        // above the new directory at the same path, then the call to umount unmounts the original root directory from
396        // this path. This is done, as otherwise, we will need to create a separate temporary directory under the new root path
397        // so we can move the original root there, and then unmount that. This way saves the creation of the temporary
398        // directory to put original root directory.
399        pivot_root(path, path).inspect_err(|errno| {
400            tracing::error!(?errno, ?path, "failed to pivot root to");
401        })?;
402
403        // Make the original root directory rslave to avoid propagating unmount event to the host mount namespace.
404        // We should use MS_SLAVE not MS_PRIVATE according to https://github.com/opencontainers/runc/pull/1500.
405        mount(
406            None::<&str>,
407            "/",
408            None::<&str>,
409            MsFlags::MS_SLAVE | MsFlags::MS_REC,
410            None::<&str>,
411        )
412        .inspect_err(|errno| {
413            tracing::error!(?errno, "failed to make original root directory rslave");
414        })?;
415
416        // Unmount the original root directory which was stacked on top of new root directory
417        // MNT_DETACH makes the mount point unavailable to new accesses, but waits till the original mount point
418        // to be free of activity to actually unmount
419        // see https://man7.org/linux/man-pages/man2/umount2.2.html for more information
420        umount2("/", MntFlags::MNT_DETACH).inspect_err(|errno| {
421            tracing::error!(?errno, "failed to unmount old root directory");
422        })?;
423        // Change directory to the new root
424        fchdir(newroot).inspect_err(|errno| {
425            tracing::error!(?errno, ?newroot, "failed to change directory to new root");
426        })?;
427
428        close(newroot).inspect_err(|errno| {
429            tracing::error!(?errno, ?newroot, "failed to close new root directory");
430        })?;
431
432        Ok(())
433    }
434
435    /// Set namespace for process
436    fn set_ns(&self, rawfd: i32, nstype: CloneFlags) -> Result<()> {
437        let fd = unsafe { BorrowedFd::borrow_raw(rawfd) };
438        nix::sched::setns(fd, nstype)?;
439        Ok(())
440    }
441
442    /// set uid and gid for process
443    fn set_id(&self, uid: Uid, gid: Gid) -> Result<()> {
444        prctl::set_keep_capabilities(true).map_err(|errno| {
445            tracing::error!(?errno, "failed to set keep capabilities to true");
446            nix::errno::Errno::from_raw(errno)
447        })?;
448        // args : real *id, effective *id, saved set *id respectively
449
450        // This is safe because at this point we have only
451        // one thread in the process
452        if unsafe { libc::syscall(libc::SYS_setresgid, gid, gid, gid) } == -1 {
453            let err = nix::errno::Errno::last();
454            tracing::error!(
455                ?err,
456                ?gid,
457                "failed to set real, effective and saved set gid"
458            );
459            return Err(err.into());
460        }
461
462        // This is safe because at this point we have only
463        // one thread in the process
464        if unsafe { libc::syscall(libc::SYS_setresuid, uid, uid, uid) } == -1 {
465            let err = nix::errno::Errno::last();
466            tracing::error!(
467                ?err,
468                ?uid,
469                "failed to set real, effective and saved set uid"
470            );
471            return Err(err.into());
472        }
473
474        // if not the root user, reset capabilities to effective capabilities,
475        // which are used by kernel to perform checks
476        // see https://man7.org/linux/man-pages/man7/capabilities.7.html for more information
477        if uid != Uid::from_raw(0) {
478            capabilities::reset_effective(self)?;
479        }
480        prctl::set_keep_capabilities(false).map_err(|errno| {
481            tracing::error!(?errno, "failed to set keep capabilities to false");
482            nix::errno::Errno::from_raw(errno)
483        })?;
484        Ok(())
485    }
486
487    /// Disassociate parts of execution context
488    // see https://man7.org/linux/man-pages/man2/unshare.2.html for more information
489    fn unshare(&self, flags: CloneFlags) -> Result<()> {
490        unshare(flags)?;
491
492        Ok(())
493    }
494    /// Set capabilities for container process
495    fn set_capability(&self, cset: CapSet, value: &CapsHashSet) -> Result<()> {
496        match cset {
497            // caps::set cannot set capabilities in bounding set,
498            // so we do it differently
499            CapSet::Bounding => {
500                // get all capabilities
501                let all = caps::read(None, CapSet::Bounding)?;
502                // the difference will give capabilities
503                // which are to be unset
504                // for each such =, drop that capability
505                // after this, only those which are to be set will remain set
506                for c in all.difference(value) {
507                    caps::drop(None, CapSet::Bounding, *c)?
508                }
509            }
510            CapSet::Ambient => {
511                // check specifically for ambient, as those might not always be available
512                //
513                // Ambient capabilities are applied from an unordered HashSet, and if any
514                // set_capability() call fails, Youki stops applying the rest. This causes
515                // inconsistent CapAmb results between runs and diverges from runc, which
516                // continues applying all ambient caps even after a failure. The same flawed
517                // ambient-cap logic also causes exec-path capability test failures.
518                caps::clear(None, CapSet::Ambient)?;
519                for c in value {
520                    if let Err(e) = caps::raise(None, CapSet::Ambient, *c) {
521                        tracing::warn!(?e, ?c, "can't raise ambient capability");
522                    }
523                }
524            }
525            _ => {
526                caps::set(None, cset, value)?;
527            }
528        }
529        Ok(())
530    }
531
532    /// Sets hostname for process
533    fn set_hostname(&self, hostname: &str) -> Result<()> {
534        sethostname(hostname)?;
535        Ok(())
536    }
537
538    /// Sets domainname for process (see
539    /// [setdomainname(2)](https://man7.org/linux/man-pages/man2/setdomainname.2.html)).
540    fn set_domainname(&self, domainname: &str) -> Result<()> {
541        let ptr = domainname.as_bytes().as_ptr() as *const c_char;
542        let len = domainname.len();
543        match unsafe { setdomainname(ptr, len) } {
544            0 => Ok(()),
545            -1 => Err(nix::Error::last()),
546
547            _ => Err(nix::Error::UnknownErrno),
548        }?;
549
550        Ok(())
551    }
552
553    /// Sets resource limit for process
554    fn set_rlimit(&self, rlimit: &PosixRlimit) -> Result<()> {
555        let rlim = &libc::rlimit {
556            rlim_cur: rlimit.soft(),
557            rlim_max: rlimit.hard(),
558        };
559
560        // Change for musl libc based on seccomp needs
561        #[cfg(not(target_env = "musl"))]
562        let res = unsafe { libc::setrlimit(rlimit.typ() as u32, rlim) };
563        #[cfg(target_env = "musl")]
564        let res = unsafe { libc::setrlimit(rlimit.typ() as i32, rlim) };
565
566        match res {
567            0 => Ok(()),
568            -1 => Err(SyscallError::Nix(nix::Error::last())),
569            _ => Err(SyscallError::Nix(nix::Error::UnknownErrno)),
570        }?;
571
572        Ok(())
573    }
574
575    // taken from https://crates.io/crates/users
576    fn get_pwuid(&self, uid: uid_t) -> Option<Arc<OsStr>> {
577        let mut passwd = unsafe { mem::zeroed::<libc::passwd>() };
578        let mut buf = vec![0; 2048];
579        let mut result = ptr::null_mut::<libc::passwd>();
580
581        loop {
582            let r = unsafe {
583                libc::getpwuid_r(uid, &mut passwd, buf.as_mut_ptr(), buf.len(), &mut result)
584            };
585
586            if r != libc::ERANGE {
587                break;
588            }
589
590            let newsize = buf.len().checked_mul(2)?;
591            buf.resize(newsize, 0);
592        }
593
594        if result.is_null() {
595            // There is no such user, or an error has occurred.
596            // errno gets set if there's an error.
597            return None;
598        }
599
600        if result != &mut passwd {
601            // The result of getpwuid_r should be its input passwd.
602            return None;
603        }
604
605        let user = unsafe { Self::passwd_to_user(result.read()) };
606        Some(user)
607    }
608
609    fn chroot(&self, path: &Path) -> Result<()> {
610        chroot(path)?;
611
612        Ok(())
613    }
614
615    fn mount(
616        &self,
617        source: Option<&Path>,
618        target: &Path,
619        fstype: Option<&str>,
620        flags: MsFlags,
621        data: Option<&str>,
622    ) -> Result<()> {
623        mount(source, target, fstype, flags, data)?;
624        Ok(())
625    }
626
627    fn mount_from_fd(&self, source_fd: &OwnedFd, target: &Path) -> Result<()> {
628        let parent = target.parent().ok_or_else(|| {
629            tracing::error!(?target, "target has no parent");
630            SyscallError::Nix(nix::Error::EINVAL)
631        })?;
632        let name = target.file_name().ok_or_else(|| {
633            tracing::error!(?target, "target has no file name");
634            SyscallError::Nix(nix::Error::EINVAL)
635        })?;
636
637        let parent_fd = unsafe {
638            OwnedFd::from_raw_fd(open(
639                parent,
640                OFlag::O_PATH | OFlag::O_CLOEXEC | OFlag::O_DIRECTORY,
641                Mode::empty(),
642            )?)
643        };
644
645        let open_tree_flags: libc::c_uint = (libc::OPEN_TREE_CLOEXEC as libc::c_uint)
646            | (libc::OPEN_TREE_CLONE as libc::c_uint)
647            | (libc::AT_EMPTY_PATH as libc::c_uint);
648
649        const EMPTY_PATH: [libc::c_char; 1] = [0];
650
651        let mount_fd_raw = unsafe {
652            libc::syscall(
653                libc::SYS_open_tree,
654                source_fd.as_raw_fd(),
655                EMPTY_PATH.as_ptr(),
656                open_tree_flags,
657            )
658        };
659
660        if mount_fd_raw < 0 {
661            let err = nix::errno::Errno::last();
662            tracing::error!(?err, "open_tree from fd failed");
663            return Err(SyscallError::Nix(err));
664        }
665        let mount_fd = unsafe { OwnedFd::from_raw_fd(mount_fd_raw as RawFd) };
666
667        let name_cstr = CString::new(name.as_bytes()).map_err(|err| {
668            tracing::error!(?target, ?err, "failed to convert file name to cstring");
669            SyscallError::Nix(nix::Error::EINVAL)
670        })?;
671
672        let res = unsafe {
673            libc::syscall(
674                libc::SYS_move_mount,
675                mount_fd.as_raw_fd(),
676                EMPTY_PATH.as_ptr(),
677                parent_fd.as_raw_fd(),
678                name_cstr.as_ptr(),
679                MOVE_MOUNT_F_EMPTY_PATH as libc::c_uint,
680            )
681        };
682
683        if res < 0 {
684            let err = nix::errno::Errno::last();
685            tracing::error!(?target, ?err, "move_mount failed");
686            return Err(SyscallError::Nix(err));
687        }
688
689        Ok(())
690    }
691
692    fn move_mount(
693        &self,
694        from_dirfd: BorrowedFd<'_>,
695        from_path: Option<&str>,
696        to_dirfd: BorrowedFd<'_>,
697        to_path: Option<&str>,
698        flags: u32,
699    ) -> Result<()> {
700        const EMPTY_PATH: [libc::c_char; 1] = [0];
701
702        let from_cstr: Option<CString> = from_path
703            .and_then(|s| if s.is_empty() { None } else { Some(s) })
704            .map(|s| CString::new(s).map_err(|_| nix::Error::EINVAL))
705            .transpose()?;
706        let from_ptr = from_cstr
707            .as_ref()
708            .map_or(EMPTY_PATH.as_ptr(), |c| c.as_ptr());
709
710        let to_cstr: Option<CString> = to_path
711            .and_then(|s| if s.is_empty() { None } else { Some(s) })
712            .map(|s| CString::new(s).map_err(|_| nix::Error::EINVAL))
713            .transpose()?;
714        let to_ptr = to_cstr.as_ref().map_or(EMPTY_PATH.as_ptr(), |c| c.as_ptr());
715
716        let rc = unsafe {
717            libc::syscall(
718                libc::SYS_move_mount,
719                from_dirfd,
720                from_ptr,
721                to_dirfd,
722                to_ptr,
723                flags as libc::c_uint,
724            )
725        };
726
727        match rc {
728            0 => Ok(()),
729            -1 => Err(nix::Error::last().into()),
730            _ => Err(nix::Error::UnknownErrno.into()),
731        }
732    }
733
734    fn fsopen(&self, fstype: Option<&str>, flags: u32) -> Result<OwnedFd> {
735        let t_cstr: Option<CString> = fstype
736            .map(|t| CString::new(t).map_err(|_| SyscallError::Nix(nix::errno::Errno::EINVAL)))
737            .transpose()?;
738
739        let t_ptr = t_cstr.as_ref().map_or(std::ptr::null(), |c| c.as_ptr());
740
741        let fd =
742            unsafe { libc::syscall(libc::SYS_fsopen, t_ptr, flags as libc::c_uint) } as libc::c_int;
743        if fd < 0 {
744            return Err(SyscallError::Nix(nix::Error::last()));
745        }
746        Ok(unsafe { OwnedFd::from_raw_fd(fd) })
747    }
748
749    fn fsconfig(
750        &self,
751        fsfd: BorrowedFd<'_>,
752        cmd: u32,
753        key: Option<&str>,
754        val: Option<&str>,
755        aux: libc::c_int,
756    ) -> Result<()> {
757        let k_cstr: Option<CString> = key
758            .map(|k| CString::new(k).map_err(|_| SyscallError::Nix(nix::errno::Errno::EINVAL)))
759            .transpose()?;
760        let k_ptr = k_cstr.as_ref().map_or(std::ptr::null(), |k| k.as_ptr());
761
762        let v_cstr: Option<CString> = val
763            .map(|v| CString::new(v).map_err(|_| SyscallError::Nix(nix::errno::Errno::EINVAL)))
764            .transpose()?;
765        let v_ptr = v_cstr
766            .as_ref()
767            .map_or(std::ptr::null(), |v| v.as_ptr() as *const libc::c_void);
768
769        let rc = unsafe {
770            libc::syscall(
771                libc::SYS_fsconfig,
772                fsfd.as_raw_fd() as libc::c_int,
773                cmd as libc::c_uint,
774                k_ptr,
775                v_ptr,
776                aux,
777            )
778        };
779        if rc == -1 {
780            return Err(SyscallError::Nix(nix::Error::last()));
781        }
782        Ok(())
783    }
784
785    fn fsmount(
786        &self,
787        fsfd: BorrowedFd<'_>,
788        flags: u32,
789        attr_flags: Option<u64>,
790    ) -> Result<OwnedFd> {
791        let attr = attr_flags.unwrap_or(0);
792
793        let ret = unsafe {
794            libc::syscall(
795                libc::SYS_fsmount,
796                fsfd.as_raw_fd() as libc::c_int,
797                flags as libc::c_uint,
798                attr as libc::c_ulong,
799            )
800        } as libc::c_int;
801
802        if ret < 0 {
803            return Err(SyscallError::Nix(nix::Error::last()));
804        }
805        Ok(unsafe { std::os::fd::OwnedFd::from_raw_fd(ret) })
806    }
807
808    //dirfd is RawFd because we need to pass AT_FDCWD
809    fn open_tree(&self, dirfd: RawFd, path: Option<&str>, flags: u32) -> Result<OwnedFd> {
810        static EMPTY: [libc::c_char; 1] = [0];
811        let path_cstr: Option<CString> = path
812            .map(|s| CString::new(s).map_err(|_| SyscallError::Nix(nix::errno::Errno::EINVAL)))
813            .transpose()?;
814        let c_path: *const c_char = match path_cstr.as_ref() {
815            Some(cs) => cs.as_ptr(),
816            None => EMPTY.as_ptr(),
817        };
818
819        let fd = unsafe {
820            libc::syscall(
821                libc::SYS_open_tree,
822                dirfd as libc::c_int,
823                c_path,
824                flags as libc::c_uint,
825            )
826        } as libc::c_int;
827
828        if fd < 0 {
829            return Err(SyscallError::Nix(nix::Error::last()));
830        }
831        Ok(unsafe { OwnedFd::from_raw_fd(fd) })
832    }
833
834    fn symlink(&self, original: &Path, link: &Path) -> Result<()> {
835        symlink(original, link)?;
836
837        Ok(())
838    }
839
840    fn mknod(&self, path: &Path, kind: SFlag, perm: Mode, dev: u64) -> Result<()> {
841        mknod(path, kind, perm, dev)?;
842
843        Ok(())
844    }
845
846    fn chown(&self, path: &Path, owner: Option<Uid>, group: Option<Gid>) -> Result<()> {
847        chown(path, owner, group)?;
848
849        Ok(())
850    }
851
852    fn set_groups(&self, groups: &[Gid]) -> Result<()> {
853        let n_groups = groups.len() as libc::size_t;
854        let groups_ptr = groups.as_ptr() as *const libc::gid_t;
855
856        // This is safe because at this point we have only
857        // one thread in the process
858        if unsafe { libc::syscall(libc::SYS_setgroups, n_groups, groups_ptr) } == -1 {
859            let err = nix::errno::Errno::last();
860            tracing::error!(?err, ?groups, "failed to set groups");
861            return Err(err.into());
862        }
863        Ok(())
864    }
865
866    #[tracing::instrument(skip(self))]
867    fn close_range(&self, preserve_fds: i32) -> Result<()> {
868        match unsafe {
869            libc::syscall(
870                libc::SYS_close_range,
871                3 + preserve_fds,
872                libc::c_int::MAX,
873                libc::CLOSE_RANGE_CLOEXEC,
874            )
875        } {
876            0 => Ok(()),
877            -1 => {
878                match nix::errno::Errno::last() {
879                    nix::errno::Errno::ENOSYS | nix::errno::Errno::EINVAL => {
880                        // close_range was introduced in kernel 5.9 and CLOSEEXEC was introduced in
881                        // kernel 5.11. If the kernel is older we emulate close_range in userspace.
882                        Self::emulate_close_range(preserve_fds)
883                    }
884                    e => Err(SyscallError::Nix(e)),
885                }
886            }
887            _ => Err(SyscallError::Nix(nix::errno::Errno::UnknownErrno)),
888        }?;
889
890        Ok(())
891    }
892
893    fn mount_setattr(
894        &self,
895        dirfd: BorrowedFd<'_>,
896        pathname: &Path,
897        flags: u32,
898        mount_attr: &MountAttr,
899        size: libc::size_t,
900    ) -> Result<()> {
901        let path_c_string = pathname
902            .to_path_buf()
903            .to_str()
904            .map(CString::new)
905            .ok_or_else(|| {
906                tracing::error!(path = ?pathname, "failed to convert path to string");
907                nix::Error::EINVAL
908            })?
909            .map_err(|err| {
910                tracing::error!(path = ?pathname, ?err, "failed to convert path to string");
911                nix::Error::EINVAL
912            })?;
913
914        match unsafe {
915            libc::syscall(
916                libc::SYS_mount_setattr,
917                dirfd,
918                path_c_string.as_ptr(),
919                flags,
920                mount_attr as *const MountAttr,
921                size,
922            )
923        } {
924            0 => Ok(()),
925            -1 => Err(nix::Error::last()),
926            _ => Err(nix::Error::UnknownErrno),
927        }?;
928        Ok(())
929    }
930
931    fn set_io_priority(&self, class: i64, priority: i64) -> Result<()> {
932        let ioprio_who_progress: libc::c_int = 1;
933        let ioprio_who_pid = 0;
934        let iop = (class << 13) | priority;
935        match unsafe {
936            libc::syscall(
937                libc::SYS_ioprio_set,
938                ioprio_who_progress,
939                ioprio_who_pid,
940                iop as libc::c_ulong,
941            )
942        } {
943            0 => Ok(()),
944            -1 => Err(nix::Error::last()),
945            _ => Err(nix::Error::UnknownErrno),
946        }?;
947        Ok(())
948    }
949
950    fn set_mempolicy(&self, mode: i32, nodemask: &[libc::c_ulong], maxnode: u64) -> Result<()> {
951        // Convert Rust types to libc types
952        let libc_nodemask = if nodemask.is_empty() {
953            std::ptr::null()
954        } else {
955            nodemask.as_ptr()
956        };
957        let libc_maxnode = maxnode as libc::c_ulong;
958
959        match unsafe {
960            libc::syscall(
961                libc::SYS_set_mempolicy,
962                mode as libc::c_long,
963                libc_nodemask,
964                libc_maxnode,
965            )
966        } {
967            0 => Ok(()),
968            -1 => Err(SyscallError::Nix(nix::Error::last())),
969            _ => Err(SyscallError::Nix(nix::Error::UnknownErrno)),
970        }
971    }
972
973    fn umount2(&self, target: &Path, flags: MntFlags) -> Result<()> {
974        umount2(target, flags)?;
975        Ok(())
976    }
977
978    fn get_uid(&self) -> Uid {
979        nix::unistd::getuid()
980    }
981
982    fn get_gid(&self) -> Gid {
983        nix::unistd::getgid()
984    }
985
986    fn get_euid(&self) -> Uid {
987        nix::unistd::geteuid()
988    }
989
990    fn get_egid(&self) -> Gid {
991        nix::unistd::getegid()
992    }
993
994    fn personality(&self, domain: PersonalityDomain) -> Result<()> {
995        let domain = nix::sys::personality::Persona::from_bits_retain(domain as i32);
996        nix::sys::personality::set(domain)
997            .map(|_| ())
998            .map_err(|e| e.into())
999    }
1000}
1001
#[cfg(test)]
mod tests {
    // Note: The fd-related tests here have to run serially because they
    // depend on the state of the system. The tests that clean up file
    // descriptors are especially troublesome when run alongside other tests,
    // because they act on whatever fds happen to be open in the process.

    use std::fs;
    use std::os::unix::prelude::AsRawFd;
    use std::str::FromStr;

    use anyhow::{Context, Result, bail};
    use nix::{fcntl, sys, unistd};
    use serial_test::serial;

    use super::{LinuxSyscall, MountOption};
    use crate::syscall::Syscall;

    /// `get_open_fds` must report a freshly opened file as well as stdio.
    #[test]
    #[serial]
    fn test_get_open_fds() -> Result<()> {
        let dev_null = fs::File::open("/dev/null")?;
        let dev_null_fd = dev_null.as_raw_fd();
        let open_fds = LinuxSyscall::get_open_fds()?;

        if !open_fds.contains(&dev_null_fd) {
            bail!("failed to find the opened dev null fds: {:?}", open_fds);
        }

        // Close the file explicitly before the test case returns.
        drop(dev_null);

        // The stdio fds should also be part of the list of opened fds.
        for stdio_fd in [0, 1, 2] {
            if !open_fds.contains(&stdio_fd) {
                bail!("failed to find the stdio fds: {:?}", open_fds);
            }
        }

        Ok(())
    }

    /// The userspace emulation must mark an inheritable fd as close-on-exec.
    #[test]
    #[serial]
    fn test_close_range_userspace() -> Result<()> {
        // Rust's own file APIs set CLOEXEC automatically, so open the fd
        // through fcntl::open to get one without the flag.
        let fd = fcntl::open("/dev/null", fcntl::OFlag::O_RDWR, sys::stat::Mode::empty())?;
        LinuxSyscall::emulate_close_range(0).context("failed to clean up the fds")?;

        let raw_flags = fcntl::fcntl(fd, fcntl::F_GETFD)?;
        let fd_flags = fcntl::FdFlag::from_bits_truncate(raw_flags);
        if !fd_flags.contains(fcntl::FdFlag::FD_CLOEXEC) {
            bail!("CLOEXEC flag is not set correctly");
        }

        unistd::close(fd)?;
        Ok(())
    }

    /// The `close_range` entry point must mark an inheritable fd as
    /// close-on-exec.
    #[test]
    #[serial]
    fn test_close_range_native() -> Result<()> {
        let fd = fcntl::open("/dev/null", fcntl::OFlag::O_RDWR, sys::stat::Mode::empty())?;
        let syscall = LinuxSyscall {};
        syscall
            .close_range(0)
            .context("failed to clean up the fds")?;

        let raw_flags = fcntl::fcntl(fd, fcntl::F_GETFD)?;
        let fd_flags = fcntl::FdFlag::from_bits_truncate(raw_flags);
        if !fd_flags.contains(fcntl::FdFlag::FD_CLOEXEC) {
            bail!("CLOEXEC flag is not set correctly");
        }

        unistd::close(fd)?;
        Ok(())
    }

    /// Every option listed by `known_options` must parse via `from_str`.
    #[test]
    fn test_known_mount_options_implemented() -> Result<()> {
        for option in MountOption::known_options() {
            if let Err(e) = MountOption::from_str(&option) {
                bail!("failed to parse mount option: {}", e);
            }
        }
        Ok(())
    }
}