Skip to main content

libcontainer/rootfs/
mount.rs

1use std::fs::{Permissions, canonicalize};
2use std::io::{BufRead, BufReader, ErrorKind};
3use std::os::fd::{AsFd, OwnedFd};
4use std::os::unix::fs::{MetadataExt, PermissionsExt};
5use std::path::{Path, PathBuf};
6use std::time::Duration;
7#[cfg(feature = "v1")]
8use std::{borrow::Cow, collections::HashMap};
9use std::{fs, mem};
10
11use libcgroups::common::CgroupSetup::{Hybrid, Legacy, Unified};
12#[cfg(feature = "v1")]
13use libcgroups::common::DEFAULT_CGROUP_ROOT;
14use nix::NixPath;
15use nix::errno::Errno;
16use nix::mount::MsFlags;
17use nix::sys::statfs::{PROC_SUPER_MAGIC, statfs};
18use oci_spec::runtime::{Mount as SpecMount, MountBuilder as SpecMountBuilder};
19use pathrs::Root;
20use pathrs::flags::OpenFlags;
21use pathrs::procfs::{ProcfsBase, ProcfsHandle};
22#[cfg(feature = "v1")]
23use procfs::process::Process;
24use procfs::process::{MountInfo, MountOptFields};
25use procfs::{FromRead, ProcessCGroups};
26
27#[cfg(feature = "v1")]
28use super::symlink::Symlink;
29use super::symlink::SymlinkError;
30use super::utils::{MountOptionConfig, parse_mount};
31use crate::syscall::syscall::create_syscall;
32use crate::syscall::{Syscall, SyscallError, linux};
33use crate::utils::{PathBufExt, retry};
34
// Upper bound on mount attempts when the kernel reports EBUSY: one initial
// attempt is made directly, then `MAX_EBUSY_MOUNT_ATTEMPTS - 1` is handed to `retry`.
const MAX_EBUSY_MOUNT_ATTEMPTS: u32 = 3;
// Delay between EBUSY retries, in milliseconds.
// runc has a retry interval of 100ms. We are following this.
// https://github.com/opencontainers/runc/blob/v1.3.0/libcontainer/rootfs_linux.go#L1235
#[cfg(not(test))]
const MOUNT_RETRY_DELAY_MS: u64 = 100;
// In tests, there is no need to delay, so set it to 0ms.
#[cfg(test)]
const MOUNT_RETRY_DELAY_MS: u64 = 0;
43
/// Errors produced while preparing or performing mounts for a container.
#[derive(Debug, thiserror::Error)]
pub enum MountError {
    /// The mount spec carries no `source`, which the mount code requires.
    #[error("no source in mount spec")]
    NoSource,
    /// An underlying `std::io` operation failed.
    #[error("io error")]
    Io(#[from] std::io::Error),
    /// A call made through the `Syscall` abstraction failed.
    #[error("syscall")]
    Syscall(#[from] crate::syscall::SyscallError),
    /// A `nix` call failed.
    #[error("nix error")]
    Nix(#[from] nix::Error),
    /// Building an OCI `Mount` through its builder failed.
    #[error("failed to build oci spec")]
    SpecBuild(#[from] oci_spec::OciSpecError),
    /// Any other error, boxed; its message is shown as-is (`transparent`).
    #[error(transparent)]
    Other(Box<dyn std::error::Error + Send + Sync>),
    /// Free-form error message.
    #[error("{0}")]
    Custom(String),
    /// Setting up symlinks failed.
    #[error("symlink")]
    Symlink(#[from] SymlinkError),
    /// Reading or parsing procfs data failed.
    #[error("procfs failed")]
    Procfs(#[from] procfs::ProcError),
    /// A mount option was not recognised; the payload is the offending option.
    // NOTE(review): presumably raised by `parse_mount` — confirm against super::utils.
    #[error("unknown mount option: {0}")]
    UnsupportedMountOption(String),
    /// An error reported by the `pathrs` crate.
    #[error(transparent)]
    Pathrs(#[from] pathrs::error::Error),
}
69
/// Result alias for this module, with [`MountError`] as the error type.
type Result<T> = std::result::Result<T, MountError>;

/// Source of mount table entries.
///
/// `Mount` reads the mount table through this trait, so the provider can be
/// replaced via `Mount::with_mountinfo_provider`.
pub trait MountInfoProvider {
    /// Returns the mount table as a list of parsed `MountInfo` entries.
    fn mountinfo(&self) -> Result<Vec<MountInfo>>;
}
75
/// Default provider that reads mountinfo from /proc via procfs.
///
/// The type is a stateless unit struct, so `Default` and `Debug` are derived:
/// a public zero-argument `new()` should always have a matching `Default`.
#[derive(Debug, Default)]
pub struct ProcMountInfoProvider;

impl ProcMountInfoProvider {
    /// Creates the provider. Equivalent to `ProcMountInfoProvider::default()`.
    pub fn new() -> Self {
        Self
    }
}
84
85impl MountInfoProvider for ProcMountInfoProvider {
86    fn mountinfo(&self) -> Result<Vec<MountInfo>> {
87        let reader = BufReader::new(ProcfsHandle::new()?.open(
88            ProcfsBase::ProcSelf,
89            "mountinfo",
90            OpenFlags::O_RDONLY | OpenFlags::O_CLOEXEC,
91        )?);
92
93        let mount_infos: Vec<MountInfo> = reader
94            .lines()
95            .map(|lr| {
96                lr.map_err(MountError::from)
97                    .and_then(|s| MountInfo::from_line(&s).map_err(MountError::from))
98            })
99            .collect::<Result<_>>()?;
100        Ok(mount_infos)
101    }
102}
103
/// Per-container settings passed alongside each mount spec to [`Mount`].
#[derive(Debug)]
pub struct MountOptions<'a> {
    /// Container rootfs; mount destinations are resolved beneath this path.
    pub root: &'a Path,
    /// Optional mount label. When set, and the host exposes `/sys/fs/selinux`,
    /// it is added as a `context=` data option (never for `proc`/`sysfs`).
    pub label: Option<&'a str>,
    /// Selects how cgroup v1 subsystems are set up: `true` mounts each
    /// hierarchy as its own `cgroup` filesystem, `false` bind mounts the
    /// matching host cgroup directory instead.
    // Only read by the cgroup v1 code, which is compiled behind the `v1`
    // feature; without that feature the field would be reported as dead code.
    #[allow(dead_code)]
    pub cgroup_ns: bool,
}
111
/// Performs the mounts required to set up a container rootfs.
pub struct Mount {
    /// Syscall backend used for every mount-related system call.
    syscall: Box<dyn Syscall>,
    /// Source of the mount table, consulted by `make_parent_mount_private`.
    mountinfo_provider: Box<dyn MountInfoProvider>,
}
116
117impl Default for Mount {
118    fn default() -> Self {
119        Self::new()
120    }
121}
122
123impl Mount {
124    pub fn new() -> Mount {
125        Mount {
126            syscall: create_syscall(),
127            mountinfo_provider: Box::new(ProcMountInfoProvider::new()),
128        }
129    }
130
131    pub fn with_mountinfo_provider<P: MountInfoProvider + 'static>(mut self, provider: P) -> Self {
132        self.mountinfo_provider = Box::new(provider);
133        self
134    }
135
136    pub fn setup_mount(&self, mount: &SpecMount, options: &MountOptions) -> Result<()> {
137        tracing::debug!("mounting {:?}", mount);
138        let mut mount_option_config = parse_mount(mount)?;
139
140        match mount.typ().as_deref() {
141            Some("cgroup") => {
142                let cgroup_setup = libcgroups::common::get_cgroup_setup().map_err(|err| {
143                    tracing::error!("failed to determine cgroup setup: {}", err);
144                    MountError::Other(err.into())
145                })?;
146                match cgroup_setup {
147                    Legacy | Hybrid => {
148                        #[cfg(not(feature = "v1"))]
149                        panic!(
150                            "libcontainer can't run in a Legacy or Hybrid cgroup setup without the v1 feature"
151                        );
152                        #[cfg(feature = "v1")]
153                        self.mount_cgroup_v1(mount, options).map_err(|err| {
154                            tracing::error!("failed to mount cgroup v1: {}", err);
155                            err
156                        })?
157                    }
158                    Unified => {
159                        #[cfg(not(feature = "v2"))]
160                        panic!(
161                            "libcontainer can't run in a Unified cgroup setup without the v2 feature"
162                        );
163                        #[cfg(feature = "v2")]
164                        self.mount_cgroup_v2(mount, options, &mount_option_config)
165                            .map_err(|err| {
166                                tracing::error!("failed to mount cgroup v2: {}", err);
167                                err
168                            })?
169                    }
170                }
171            }
172            // procfs and sysfs are special because we need to ensure they are actually
173            // mounted on a specific path in a container without any funny business.
174            // Ref: https://github.com/opencontainers/runc/security/advisories/GHSA-fh74-hm69-rqjw
175            Some(typ @ ("proc" | "sysfs")) => {
176                let dest_path = options
177                    .root
178                    .join_safely(Path::new(mount.destination()).normalize())
179                    .map_err(|err| {
180                        tracing::error!(
181                            "could not join rootfs path with mount destination {:?}: {}",
182                            mount.destination(),
183                            err
184                        );
185                        MountError::Other(err.into())
186                    })?;
187
188                match fs::symlink_metadata(&dest_path) {
189                    Ok(m) if !m.is_dir() => {
190                        return Err(MountError::Other(
191                            format!("filesystem {} must be mounted on ordinary directory", typ)
192                                .into(),
193                        ));
194                    }
195                    Err(e) if e.kind() != ErrorKind::NotFound => {
196                        return Err(MountError::Other(
197                            format!("symlink_metadata failed for {}: {}", dest_path.display(), e)
198                                .into(),
199                        ));
200                    }
201                    _ => {}
202                }
203
204                self.check_proc_mount(options.root, mount)?;
205
206                self.mount_into_container(mount, options.root, &mount_option_config, options.label)
207                    .map_err(|err| {
208                        tracing::error!("failed to mount {:?}: {}", mount, err);
209                        err
210                    })?;
211            }
212            _ => {
213                if mount.destination() == Path::new("/dev") {
214                    mount_option_config.flags &= !MsFlags::MS_RDONLY;
215                    self.mount_into_container(
216                        mount,
217                        options.root,
218                        &mount_option_config,
219                        options.label,
220                    )
221                    .map_err(|err| {
222                        tracing::error!("failed to mount /dev: {}", err);
223                        err
224                    })?;
225                } else {
226                    self.mount_into_container(
227                        mount,
228                        options.root,
229                        &mount_option_config,
230                        options.label,
231                    )
232                    .map_err(|err| {
233                        tracing::error!("failed to mount {:?}: {}", mount, err);
234                        err
235                    })?;
236                }
237            }
238        }
239
240        Ok(())
241    }
242
    /// Mounts the cgroup v1 hierarchies at the destination of `cgroup_mount`.
    ///
    /// Steps:
    /// 1. mount a tmpfs at the cgroup destination to hold the subsystem mounts;
    /// 2. list the host's subsystem mount points under `DEFAULT_CGROUP_ROOT`;
    /// 3. read this process's cgroup paths, stripping the prefix contributed by
    ///    the reference process (the parent, or ourselves when the ppid is 0);
    /// 4. for every host subsystem, set it up either namespaced or emulated
    ///    (chosen by `options.cgroup_ns`) and create its comount symlinks.
    ///
    /// Host mount points whose file name cannot be obtained as UTF-8 are skipped
    /// with a warning.
    #[cfg(feature = "v1")]
    fn mount_cgroup_v1(&self, cgroup_mount: &SpecMount, options: &MountOptions) -> Result<()> {
        tracing::debug!("mounting cgroup v1 filesystem");
        // create tmpfs into which the cgroup subsystems will be mounted
        let tmpfs = SpecMountBuilder::default()
            .source("tmpfs")
            .typ("tmpfs")
            .destination(cgroup_mount.destination())
            .options(
                ["noexec", "nosuid", "nodev", "mode=755"]
                    .iter()
                    .map(|o| o.to_string())
                    .collect::<Vec<String>>(),
            )
            .build()
            .map_err(|err| {
                tracing::error!("failed to build tmpfs for cgroup: {}", err);
                err
            })?;

        // Goes through setup_mount again; a tmpfs takes the generic branch there.
        self.setup_mount(&tmpfs, options).map_err(|err| {
            tracing::error!("failed to mount tmpfs for cgroup: {}", err);
            err
        })?;

        // get all cgroup mounts on the host system
        let host_mounts: Vec<PathBuf> = libcgroups::v1::util::list_subsystem_mount_points()
            .map_err(|err| {
                tracing::error!("failed to get subsystem mount points: {}", err);
                MountError::Other(err.into())
            })?
            .into_iter()
            // Keep only the mount points located under the default cgroup root.
            .filter(|p| p.as_path().starts_with(DEFAULT_CGROUP_ROOT))
            .collect();
        tracing::debug!("cgroup mounts: {:?}", host_mounts);

        // get process cgroups
        let ppid = std::os::unix::process::parent_id();
        // A non-zero ppid means that the PID namespace is not separated, so the
        // parent's cgroups are the reference; with a ppid of 0 we use our own pid.
        let ppid = if ppid == 0 { std::process::id() } else { ppid };
        let root_cgroups = Process::new(ppid as i32)?.cgroups()?.0;
        // Maps the comma-joined controller list of each hierarchy to this
        // process's cgroup path in it, relative to the reference process's path.
        let process_cgroups: HashMap<String, String> =
            ProcessCGroups::from_read(ProcfsHandle::new()?.open(
                ProcfsBase::ProcSelf,
                "cgroup",
                OpenFlags::O_RDONLY | OpenFlags::O_CLOEXEC,
            )?)?
            .into_iter()
            .map(|c| {
                let hierarchy = c.hierarchy;
                // When youki itself is running inside a container, the cgroup path
                // will include the path of pid-1, which needs to be stripped before
                // mounting.
                let root_pathname = root_cgroups
                    .iter()
                    .find(|c| c.hierarchy == hierarchy)
                    .map(|c| c.pathname.as_ref())
                    // No matching hierarchy on the reference process: strip nothing.
                    .unwrap_or("");
                let path = c
                    .pathname
                    .strip_prefix(root_pathname)
                    // Keep the full path when it does not start with the root path.
                    .unwrap_or(&c.pathname);
                (c.controllers.join(","), path.to_owned())
            })
            .collect();
        tracing::debug!("Process cgroups: {:?}", process_cgroups);

        let cgroup_root = options
            .root
            .join_safely(cgroup_mount.destination())
            .map_err(|err| {
                tracing::error!(
                    "could not join rootfs path with cgroup mount destination: {}",
                    err
                );
                MountError::Other(err.into())
            })?;
        tracing::debug!("cgroup root: {:?}", cgroup_root);

        let symlink = Symlink::new();

        // setup cgroup mounts for container
        for host_mount in &host_mounts {
            // The last path component of the host mount point is used as the subsystem name.
            if let Some(subsystem_name) = host_mount.file_name().and_then(|n| n.to_str()) {
                // The "systemd" hierarchy is a named hierarchy and is flagged as such.
                if options.cgroup_ns {
                    self.setup_namespaced_subsystem(
                        cgroup_mount,
                        options,
                        subsystem_name,
                        subsystem_name == "systemd",
                    )?;
                } else {
                    self.setup_emulated_subsystem(
                        cgroup_mount,
                        options,
                        subsystem_name,
                        subsystem_name == "systemd",
                        host_mount,
                        &process_cgroups,
                    )?;
                }

                symlink.setup_comount_symlinks(&cgroup_root, subsystem_name)?;
            } else {
                tracing::warn!("could not get subsystem name from {:?}", host_mount);
            }
        }

        Ok(())
    }
353
354    // On some distros cgroup subsystems are comounted e.g. cpu,cpuacct or net_cls,net_prio. These systems
355    // have to be comounted in the container as well as the kernel will reject trying to mount them separately.
356    #[cfg(feature = "v1")]
357    fn setup_namespaced_subsystem(
358        &self,
359        cgroup_mount: &SpecMount,
360        options: &MountOptions,
361        subsystem_name: &str,
362        named: bool,
363    ) -> Result<()> {
364        tracing::debug!(
365            "Mounting (namespaced) {:?} cgroup subsystem",
366            subsystem_name
367        );
368        let subsystem_mount = SpecMountBuilder::default()
369            .source("cgroup")
370            .typ("cgroup")
371            .destination(cgroup_mount.destination().join(subsystem_name))
372            .options(
373                ["noexec", "nosuid", "nodev"]
374                    .iter()
375                    .map(|o| o.to_string())
376                    .collect::<Vec<String>>(),
377            )
378            .build()
379            .map_err(|err| {
380                tracing::error!("failed to build {subsystem_name} mount: {err}");
381                err
382            })?;
383
384        let data: Cow<str> = if named {
385            format!("name={subsystem_name}").into()
386        } else {
387            subsystem_name.into()
388        };
389
390        let mount_options_config = MountOptionConfig {
391            flags: MsFlags::MS_NOEXEC | MsFlags::MS_NOSUID | MsFlags::MS_NODEV,
392            data: vec![data.into_owned()],
393            rec_attr: None,
394        };
395
396        self.mount_into_container(
397            &subsystem_mount,
398            options.root,
399            &mount_options_config,
400            options.label,
401        )
402        .map_err(|err| {
403            tracing::error!("failed to mount {subsystem_mount:?}: {err}");
404            err
405        })
406    }
407
408    #[cfg(feature = "v1")]
409    fn setup_emulated_subsystem(
410        &self,
411        cgroup_mount: &SpecMount,
412        options: &MountOptions,
413        subsystem_name: &str,
414        named: bool,
415        host_mount: &Path,
416        process_cgroups: &HashMap<String, String>,
417    ) -> Result<()> {
418        tracing::debug!("Mounting (emulated) {:?} cgroup subsystem", subsystem_name);
419        let named_hierarchy: Cow<str> = if named {
420            format!("name={subsystem_name}").into()
421        } else {
422            subsystem_name.into()
423        };
424
425        if let Some(proc_path) = process_cgroups.get(named_hierarchy.as_ref()) {
426            let emulated = SpecMountBuilder::default()
427                .source(
428                    host_mount
429                        .join_safely(proc_path.as_str())
430                        .map_err(|err| {
431                            tracing::error!(
432                                "failed to join mount source for {subsystem_name} subsystem: {}",
433                                err
434                            );
435                            MountError::Other(err.into())
436                        })?,
437                )
438                .destination(
439                    cgroup_mount
440                        .destination()
441                        .join_safely(subsystem_name)
442                        .map_err(|err| {
443                            tracing::error!(
444                                "failed to join mount destination for {subsystem_name} subsystem: {}",
445                                err
446                            );
447                            MountError::Other(err.into())
448                        })?,
449                )
450                .typ("bind")
451                .options(
452                    ["rw", "rbind"]
453                        .iter()
454                        .map(|o| o.to_string())
455                        .collect::<Vec<String>>(),
456                )
457                .build()?;
458            tracing::debug!("Mounting emulated cgroup subsystem: {:?}", emulated);
459
460            self.setup_mount(&emulated, options).map_err(|err| {
461                tracing::error!("failed to mount {subsystem_name} cgroup hierarchy: {}", err);
462                err
463            })?;
464        } else {
465            tracing::warn!("Could not mount {:?} cgroup subsystem", subsystem_name);
466        }
467
468        Ok(())
469    }
470
471    #[cfg(feature = "v2")]
472    fn mount_cgroup_v2(
473        &self,
474        cgroup_mount: &SpecMount,
475        options: &MountOptions,
476        mount_option_config: &MountOptionConfig,
477    ) -> Result<()> {
478        tracing::debug!("Mounting cgroup v2 filesystem");
479
480        let cgroup_mount = SpecMountBuilder::default()
481            .typ("cgroup2")
482            .source("cgroup")
483            .destination(cgroup_mount.destination())
484            .options(Vec::new())
485            .build()?;
486        tracing::debug!("{:?}", cgroup_mount);
487
488        if self
489            .mount_into_container(
490                &cgroup_mount,
491                options.root,
492                mount_option_config,
493                options.label,
494            )
495            .is_err()
496        {
497            let host_mount = libcgroups::v2::util::get_unified_mount_point().map_err(|err| {
498                tracing::error!("failed to get unified mount point: {}", err);
499                MountError::Other(err.into())
500            })?;
501
502            let process_cgroup = ProcessCGroups::from_read(ProcfsHandle::new()?.open(
503                ProcfsBase::ProcSelf,
504                "cgroup",
505                OpenFlags::O_RDONLY | OpenFlags::O_CLOEXEC,
506            )?)?
507            .into_iter()
508            .find(|c| c.hierarchy == 0)
509            .map(|c| PathBuf::from(c.pathname))
510            .ok_or_else(|| MountError::Custom("failed to find unified process cgroup".into()))?;
511
512            let bind_mount = SpecMountBuilder::default()
513                .typ("bind")
514                .source(host_mount.join_safely(process_cgroup).map_err(|err| {
515                    tracing::error!("failed to join host mount for cgroup hierarchy: {}", err);
516                    MountError::Other(err.into())
517                })?)
518                .destination(cgroup_mount.destination())
519                .options(Vec::new())
520                .build()
521                .map_err(|err| {
522                    tracing::error!("failed to build cgroup bind mount: {}", err);
523                    err
524                })?;
525            tracing::debug!("{:?}", bind_mount);
526
527            let mut mount_option_config = (*mount_option_config).clone();
528            mount_option_config.flags |= MsFlags::MS_BIND;
529            self.mount_into_container(
530                &bind_mount,
531                options.root,
532                &mount_option_config,
533                options.label,
534            )
535            .map_err(|err| {
536                tracing::error!("failed to bind mount cgroup hierarchy: {}", err);
537                err
538            })?;
539        }
540
541        Ok(())
542    }
543
544    /// Make parent mount of rootfs private if it was shared, which is required by pivot_root.
545    /// It also makes sure following bind mount does not propagate in other namespaces.
546    pub fn make_parent_mount_private(&self, rootfs: &Path) -> Result<MountInfo> {
547        let mount_infos = self.mountinfo_provider.mountinfo()?;
548        let parent_mount = find_parent_mount(rootfs, mount_infos)?;
549
550        // check parent mount has 'shared' propagation type
551        if parent_mount
552            .opt_fields
553            .iter()
554            .any(|field| matches!(field, MountOptFields::Shared(_)))
555        {
556            self.syscall.mount(
557                None,
558                &parent_mount.mount_point,
559                None,
560                MsFlags::MS_PRIVATE,
561                None,
562            )?;
563        }
564        Ok(parent_mount)
565    }
566
    /// Performs the mount described by `m` at its destination inside `rootfs`,
    /// using the fd-based mount API exposed by the `Syscall` trait.
    ///
    /// - `mount_option_config` supplies the mount flags, the filesystem data
    ///   options and the optional recursive attributes (`rec_attr`).
    /// - `label`, when set, is added as a `context=` data option if the host
    ///   exposes `/sys/fs/selinux`; it is never added for `proc` or `sysfs`.
    ///
    /// The destination is created when missing: a directory, or an empty file
    /// when a `bind` source is not a directory. Bind mounts clone the source
    /// tree and attach it; other mounts create a new filesystem instance. The
    /// data options are consumed only by the non-bind path.
    ///
    /// # Errors
    /// Returns `MountError::NoSource` when the spec has no source, and
    /// propagates failures from path handling and the mount syscalls. For
    /// non-bind mounts an `EINVAL` is retried once and an `EBUSY` is retried
    /// through `retry` with a delay.
    fn mount_into_container(
        &self,
        m: &SpecMount,
        rootfs: &Path,
        mount_option_config: &MountOptionConfig,
        label: Option<&str>,
    ) -> Result<()> {
        let typ = m.typ().as_deref();
        let mut data_options = mount_option_config.data.clone();

        // Append the SELinux context, except for proc/sysfs and only when the
        // selinuxfs path exists on the host.
        if let Some(l) = label {
            if typ != Some("proc") && typ != Some("sysfs") {
                if Path::new("/sys/fs/selinux").exists() {
                    data_options.push(format!("context={}", l));
                } else {
                    tracing::debug!("ignoring mount label because SELinux is disabled");
                }
            }
        }

        // All destination creation and resolution goes through this pathrs
        // handle opened on the rootfs.
        let root = Root::open(rootfs)?;
        let container_dest = m.destination();

        let source = m.source().as_ref().ok_or(MountError::NoSource)?;
        let dir_perm = Permissions::from_mode(0o755);
        let src = if typ == Some("bind") {
            let src = canonicalize(source).map_err(|err| {
                tracing::error!("failed to canonicalize {:?}: {}", source, err);
                err
            })?;

            // The destination must have the same kind as the source:
            // a directory for a directory, a regular file otherwise.
            if src.is_dir() {
                root.mkdir_all(container_dest, &dir_perm)?;
            } else {
                let parent = container_dest
                    .parent()
                    .ok_or(MountError::Custom("destination has no parent".to_string()))?;
                root.mkdir_all(parent, &dir_perm)?;

                match root.create_file(
                    container_dest,
                    OpenFlags::O_EXCL
                        | OpenFlags::O_CREAT
                        | OpenFlags::O_NOFOLLOW
                        | OpenFlags::O_CLOEXEC,
                    &Permissions::from_mode(0o644),
                ) {
                    Ok(_) => Ok(()),
                    // Creation failed: accept it if the destination can be
                    // resolved (i.e. it is already present); otherwise report
                    // the original creation error.
                    Err(create_err) => root
                        .resolve(container_dest)
                        .map(|_| ())
                        .map_err(|_| create_err),
                }?;
            };

            src
        } else {
            // Non-bind mounts always go onto a directory; the source is used verbatim.
            root.mkdir_all(container_dest, &dir_perm)?;
            PathBuf::from(source)
        };

        // Hold the resolved destination as an fd for move_mount.
        let dest: OwnedFd = root.resolve(container_dest)?.into();
        let dest_fd = dest.as_fd();

        // A mount is treated as a bind mount when its type is "bind" or when
        // its options contain "bind" or "rbind".
        let is_bind = typ == Some("bind")
            || m.options()
                .as_deref()
                .is_some_and(|ops| ops.iter().any(|o| o == "bind" || o == "rbind"));

        // fd-based mount flow:
        // - bind: open_tree -> mount_setattr -> move_mount
        // - nonbind: fsopen -> fsconfig -> fsmount -> mount_setattr -> move_mount
        if is_bind {
            // Only the "rbind" option makes the clone recursive.
            let recursive = m
                .options()
                .as_ref()
                .map(|v| v.iter().any(|o| o == "rbind"))
                .unwrap_or(false);
            let mut open_tree_flags: libc::c_uint = (libc::OPEN_TREE_CLOEXEC as libc::c_uint)
                | (libc::OPEN_TREE_CLONE as libc::c_uint)
                | (libc::AT_EMPTY_PATH as libc::c_uint);
            if recursive {
                open_tree_flags |= libc::AT_RECURSIVE as libc::c_uint;
            };

            // A source path that is not valid UTF-8 is reported as EINVAL.
            let src_str = src.to_str().ok_or(SyscallError::Nix(Errno::EINVAL))?;
            let mount_fd_owned =
                self.syscall
                    .open_tree(libc::AT_FDCWD, Some(src_str), open_tree_flags)?;
            let mount_fd = mount_fd_owned.as_fd();

            // mount_setattr
            let attr_set_from_flags = self.mount_flag_to_attr(&mount_option_config.flags);
            let mut mount_attr = linux::MountAttr {
                attr_set: 0,
                attr_clr: 0,
                propagation: 0,
                userns_fd: 0,
            };
            mount_attr.attr_set |= attr_set_from_flags;

            self.apply_atime_from_msflags(
                &mut mount_attr,
                attr_set_from_flags,
                mount_option_config.flags,
            );

            self.syscall.mount_setattr(
                mount_fd,
                Path::new(""),
                linux::AT_EMPTY_PATH,
                &mount_attr,
                mem::size_of::<linux::MountAttr>(),
            )?;

            // rec_attr is applied recursively
            if let Some(rec_attr) = &mount_option_config.rec_attr {
                self.syscall.mount_setattr(
                    mount_fd,
                    Path::new(""),
                    linux::AT_EMPTY_PATH | linux::AT_RECURSIVE,
                    rec_attr,
                    mem::size_of::<linux::MountAttr>(),
                )?;
            }

            // move_mount
            self.syscall.move_mount(
                mount_fd,
                None,
                dest_fd,
                None,
                linux::MOVE_MOUNT_T_EMPTY_PATH | linux::MOVE_MOUNT_F_EMPTY_PATH,
            )?;
        } else {
            // The whole sequence lives in a closure so it can be re-run as a
            // unit by the retry handling below.
            let mount_fn = || -> std::result::Result<(), SyscallError> {
                // fsopen
                let fsfd_owned = self.syscall.fsopen(typ, 0)?;
                let fsfd = fsfd_owned.as_fd();

                // fsconfig
                let src_str = src
                    .as_os_str()
                    .to_str()
                    .ok_or(SyscallError::Nix(Errno::EINVAL))?;
                self.syscall.fsconfig(
                    fsfd,
                    linux::FSCONFIG_SET_STRING as u32,
                    Some("source"),
                    Some(src_str),
                    0,
                )?;

                // "key=value" options are set as strings, bare options as
                // flags; empty entries are skipped.
                for opt in data_options.iter().filter(|s| !s.is_empty()) {
                    if let Some((k, v)) = opt.split_once('=') {
                        self.syscall.fsconfig(
                            fsfd,
                            linux::FSCONFIG_SET_STRING as u32,
                            Some(k),
                            Some(v),
                            0,
                        )?;
                    } else {
                        self.syscall.fsconfig(
                            fsfd,
                            linux::FSCONFIG_SET_FLAG as u32,
                            Some(opt),
                            None,
                            0,
                        )?;
                    };
                }

                self.syscall
                    .fsconfig(fsfd, linux::FSCONFIG_CMD_CREATE as u32, None, None, 0)?;

                // fsmount
                let mount_fd_owned = self.syscall.fsmount(fsfd, 0, None)?;
                let mount_fd = mount_fd_owned.as_fd();

                // mount_setattr
                let attr_set_from_flags = self.mount_flag_to_attr(&mount_option_config.flags);
                let mut mount_attr = linux::MountAttr {
                    attr_set: 0,
                    attr_clr: 0,
                    propagation: 0,
                    userns_fd: 0,
                };
                mount_attr.attr_set |= attr_set_from_flags;

                self.apply_atime_from_msflags(
                    &mut mount_attr,
                    attr_set_from_flags,
                    mount_option_config.flags,
                );

                self.syscall.mount_setattr(
                    mount_fd,
                    Path::new(""),
                    linux::AT_EMPTY_PATH,
                    &mount_attr,
                    mem::size_of::<linux::MountAttr>(),
                )?;

                // rec_attr is applied recursively
                if let Some(rec_attr) = &mount_option_config.rec_attr {
                    self.syscall.mount_setattr(
                        mount_fd,
                        Path::new(""),
                        linux::AT_EMPTY_PATH | linux::AT_RECURSIVE,
                        rec_attr,
                        mem::size_of::<linux::MountAttr>(),
                    )?;
                }

                // move_mount
                self.syscall.move_mount(
                    mount_fd,
                    None,
                    dest_fd,
                    None,
                    linux::MOVE_MOUNT_T_EMPTY_PATH | linux::MOVE_MOUNT_F_EMPTY_PATH,
                )?;
                Ok(())
            };

            match mount_fn() {
                Ok(()) => {}
                // EINVAL: run the identical sequence one more time.
                Err(SyscallError::Nix(nix::Error::EINVAL)) => {
                    mount_fn()?;
                }
                // EBUSY: hand over to `retry`, which keeps going only while the
                // error is still EBUSY. One attempt has already been made, hence
                // the `- 1`.
                Err(SyscallError::Nix(nix::Error::EBUSY)) => {
                    let delay = Duration::from_millis(MOUNT_RETRY_DELAY_MS);
                    let retry_policy =
                        |err: &SyscallError| matches!(err, SyscallError::Nix(Errno::EBUSY));
                    retry(mount_fn, MAX_EBUSY_MOUNT_ATTEMPTS - 1, delay, retry_policy)?;
                }
                Err(e) => return Err(e.into()),
            }
        }

        Ok(())
    }
811
812    // https://man7.org/linux/man-pages/man2/mount_setattr.2.html
813    // To apply MsFlags via mount_setattr, we set the corresponding bits in attr_set
814    fn mount_flag_to_attr(&self, flags: &MsFlags) -> u64 {
815        const MAP_SET: &[(MsFlags, u64)] = &[
816            (MsFlags::MS_RDONLY, linux::MOUNT_ATTR_RDONLY),
817            (MsFlags::MS_NOSUID, linux::MOUNT_ATTR_NOSUID),
818            (MsFlags::MS_NODEV, linux::MOUNT_ATTR_NODEV),
819            (MsFlags::MS_NOEXEC, linux::MOUNT_ATTR_NOEXEC),
820            (MsFlags::MS_NOATIME, linux::MOUNT_ATTR_NOATIME),
821            (MsFlags::MS_NODIRATIME, linux::MOUNT_ATTR_NODIRATIME),
822            (MsFlags::MS_RELATIME, linux::MOUNT_ATTR_RELATIME),
823            (MsFlags::MS_STRICTATIME, linux::MOUNT_ATTR_STRICTATIME),
824        ];
825
826        let mut set = 0;
827        for (ms, attr) in MAP_SET {
828            if flags.intersects(*ms) {
829                set |= *attr;
830            }
831        }
832        set
833    }
834
835    // Apply atime-related configuration.
836    // https://man7.org/linux/man-pages/man2/mount_setattr.2.html
837    // ref: MOUNT_ATTR__ATIME
838    fn apply_atime_from_msflags(
839        &self,
840        mount_attr: &mut linux::MountAttr,
841        attr_set_from_flags: u64,
842        msflags: MsFlags,
843    ) {
844        let atime_bits =
845            linux::MOUNT_ATTR_NOATIME | linux::MOUNT_ATTR_STRICTATIME | linux::MOUNT_ATTR_RELATIME;
846
847        let noatime = msflags.contains(MsFlags::MS_NOATIME);
848        let strictatime = msflags.contains(MsFlags::MS_STRICTATIME);
849        let relatime = msflags.contains(MsFlags::MS_RELATIME);
850
851        let atime = if strictatime {
852            linux::MOUNT_ATTR_STRICTATIME
853        } else if noatime {
854            linux::MOUNT_ATTR_NOATIME
855        } else if relatime {
856            linux::MOUNT_ATTR_RELATIME
857        } else {
858            0
859        };
860
861        let non_atime = attr_set_from_flags & !atime_bits;
862
863        if atime != 0 {
864            mount_attr.attr_clr |= linux::MOUNT_ATTR__ATIME;
865            mount_attr.attr_set |= non_atime | atime;
866        } else {
867            mount_attr.attr_set |= non_atime;
868        }
869    }
870
871    /// check_proc_mount checks to ensure that the mount destination is not over the top of /proc.
872    /// dest is required to be an abs path and have any symlinks resolved before calling this function.
873    /// # Example  (a valid case where `/proc` is mounted with `proc` type.)
874    ///
875    /// ```
876    /// use std::path::PathBuf;
877    /// use oci_spec::runtime::MountBuilder as SpecMountBuilder;
878    /// use libcontainer::rootfs::Mount;
879    ///
880    /// let mounter = Mount::new();
881    ///
882    /// let rootfs = PathBuf::from("/var/lib/my-runtime/containers/abcd1234/rootfs");
883    /// let destination = PathBuf::from("/proc");
884    /// let source = PathBuf::from("proc");
885    /// let typ = "proc";
886    ///
887    /// let mount = SpecMountBuilder::default()
888    ///     .destination(destination)
889    ///     .typ(typ)
890    ///     .source(source)
891    ///     .build()
892    ///     .expect("failed to build SpecMount");
893    ///
894    /// assert!(mounter.check_proc_mount(rootfs.as_path(), &mount).is_ok());
895    /// ```
896    /// # Example (bind mount to `/proc` that should fail)
897    /// ```
898    /// use std::path::PathBuf;
899    /// use oci_spec::runtime::MountBuilder as SpecMountBuilder;
900    /// use libcontainer::rootfs::Mount;
901    ///
902    /// let mounter = Mount::new();
903    ///
904    /// let rootfs = PathBuf::from("/var/lib/my-runtime/containers/abcd1234/rootfs");
905    /// let destination = PathBuf::from("/proc");
906    /// let source = PathBuf::from("/tmp");
907    /// let typ = "bind";
908    ///
909    /// let mount = SpecMountBuilder::default()
910    ///     .destination(destination)
911    ///     .typ(typ)
912    ///     .source(source)
913    ///     .build()
914    ///     .expect("failed to build SpecMount");
915    ///
916    /// assert!(mounter.check_proc_mount(rootfs.as_path(), &mount).is_err());
917    /// ```
918    pub fn check_proc_mount(&self, rootfs: &Path, mount: &SpecMount) -> Result<()> {
919        const PROC_ROOT_INO: u64 = 1;
920        const VALID_PROC_MOUNTS: &[&str] = &[
921            "/proc/cpuinfo",
922            "/proc/diskstats",
923            "/proc/meminfo",
924            "/proc/stat",
925            "/proc/swaps",
926            "/proc/uptime",
927            "/proc/loadavg",
928            "/proc/slabinfo",
929            "/proc/sys/kernel/ns_last_pid",
930            "/proc/sys/crypto/fips_enabled",
931        ];
932
933        let dest = mount.destination();
934
935        let container_proc_path = rootfs.join("proc");
936        let dest_path = rootfs.join_safely(dest).map_err(|err| {
937            tracing::error!(
938                "could not join rootfs path with mount destination {:?}: {}",
939                dest,
940                err
941            );
942            MountError::Other(err.into())
943        })?;
944
945        // If path is Ok, it means dest_path is under /proc.
946        // - Ok(p) with p.is_empty(): mount target is exactly /proc.
947        //   In this case, check if the mount source is procfs.
948        // - Ok(p) with !p.is_empty(): mount target is under /proc.
949        //   Only allow if it matches a specific whitelist of proc entries.
950        // - Err: not under /proc, so no further checks are needed
951        let path = dest_path.strip_prefix(&container_proc_path);
952
953        match path {
954            Err(_) => Ok(()),
955            Ok(p) if p.as_os_str().is_empty() => {
956                if mount.typ().as_deref() == Some("proc") {
957                    return Ok(());
958                }
959
960                if mount.typ().as_deref() == Some("bind") {
961                    if let Some(source) = mount.source() {
962                        let stat = statfs(source).map_err(MountError::from)?;
963                        if stat.filesystem_type() == PROC_SUPER_MAGIC {
964                            let meta = fs::metadata(source).map_err(MountError::from)?;
965                            // Follow the behavior of runc's checkProcMount function.
966                            if meta.ino() != PROC_ROOT_INO {
967                                tracing::warn!(
968                                    "bind-mount {} (source {:?}) is of type procfs but not the root (inode {}). \
969                                    Future versions may reject this.",
970                                    dest.display(),
971                                    mount.source(),
972                                    meta.ino()
973                                );
974                            }
975                            return Ok(());
976                        }
977                    }
978                }
979
980                Err(MountError::Custom(format!(
981                    "{} cannot be mounted because it is not type proc",
982                    dest.display()
983                )))
984            }
985            Ok(_) => {
986                // Here dest is definitely under /proc. Do not allow those,
987                // except for a few specific entries emulated by lxcfs.
988                let is_allowed = VALID_PROC_MOUNTS.iter().any(|allowed_path| {
989                    let container_allowed_path = rootfs.join(allowed_path.trim_start_matches('/'));
990                    dest_path == container_allowed_path
991                });
992
993                if is_allowed {
994                    Ok(())
995                } else {
996                    Err(MountError::Other(
997                        format!("{} is not a valid mount under /proc", dest.display()).into(),
998                    ))
999                }
1000            }
1001        }
1002    }
1003}
1004
1005/// Find parent mount of rootfs in given mount infos
1006pub fn find_parent_mount(
1007    rootfs: &Path,
1008    mount_infos: Vec<MountInfo>,
1009) -> std::result::Result<MountInfo, MountError> {
1010    // find the longest mount point
1011    let parent_mount_info = mount_infos
1012        .into_iter()
1013        .filter(|mi| rootfs.starts_with(&mi.mount_point))
1014        .max_by(|mi1, mi2| mi1.mount_point.len().cmp(&mi2.mount_point.len()))
1015        .ok_or_else(|| {
1016            MountError::Custom(format!("can't find the parent mount of {:?}", rootfs))
1017        })?;
1018    Ok(parent_mount_info)
1019}
1020
1021#[cfg(test)]
1022mod tests {
1023    #[cfg(feature = "v1")]
1024    use std::fs;
1025    use std::fs::OpenOptions;
1026    use std::os::unix::fs::symlink;
1027    use std::os::unix::net::UnixListener;
1028    use std::str::FromStr;
1029
1030    use anyhow::{Context, Ok, Result};
1031
1032    use super::*;
1033    use crate::syscall::test::{ArgName, MountArgs, TestHelperSyscall};
1034
1035    #[test]
1036    #[ignore] // TODO: fix fd-based test
1037    fn test_mount_into_container() -> Result<()> {
1038        let tmp_dir = tempfile::tempdir()?;
1039        {
1040            let m = Mount::new();
1041            let mount = &SpecMountBuilder::default()
1042                .destination(PathBuf::from("/dev/pts"))
1043                .typ("devpts")
1044                .source(PathBuf::from("devpts"))
1045                .options(vec![
1046                    "nosuid".to_string(),
1047                    "noexec".to_string(),
1048                    "newinstance".to_string(),
1049                    "ptmxmode=0666".to_string(),
1050                    "mode=0620".to_string(),
1051                    "gid=5".to_string(),
1052                ])
1053                .build()?;
1054            let mount_option_config = parse_mount(mount)?;
1055
1056            assert!(
1057                m.mount_into_container(
1058                    mount,
1059                    tmp_dir.path(),
1060                    &mount_option_config,
1061                    Some("defaults")
1062                )
1063                .is_ok()
1064            );
1065
1066            let want = vec![MountArgs {
1067                source: Some(PathBuf::from("devpts")),
1068                target: tmp_dir.path().join("dev/pts"),
1069                fstype: Some("devpts".to_string()),
1070                flags: MsFlags::MS_NOSUID | MsFlags::MS_NOEXEC,
1071                data: Some(
1072                    "newinstance,ptmxmode=0666,mode=0620,gid=5,context=\"defaults\"".to_string(),
1073                ),
1074            }];
1075            let got = &m
1076                .syscall
1077                .as_any()
1078                .downcast_ref::<TestHelperSyscall>()
1079                .unwrap()
1080                .get_mount_args();
1081            assert_eq!(want, *got);
1082            assert_eq!(got.len(), 1);
1083        }
1084        {
1085            let m = Mount::new();
1086            let mount = &SpecMountBuilder::default()
1087                .destination(PathBuf::from("/dev/null"))
1088                .typ("bind")
1089                .source(tmp_dir.path().join("null"))
1090                .options(vec!["ro".to_string()])
1091                .build()?;
1092            let mount_option_config = parse_mount(mount)?;
1093            OpenOptions::new()
1094                .create(true)
1095                .truncate(true)
1096                .write(true)
1097                .open(tmp_dir.path().join("null"))?;
1098
1099            assert!(
1100                m.mount_into_container(mount, tmp_dir.path(), &mount_option_config, None)
1101                    .is_ok()
1102            );
1103
1104            let want = vec![
1105                MountArgs {
1106                    source: Some(tmp_dir.path().join("null")),
1107                    target: tmp_dir.path().join("dev/null"),
1108                    fstype: Some("bind".to_string()),
1109                    flags: MsFlags::MS_RDONLY,
1110                    data: Some("".to_string()),
1111                },
1112                // remount one
1113                MountArgs {
1114                    source: None,
1115                    target: tmp_dir.path().join("dev/null"),
1116                    fstype: None,
1117                    flags: MsFlags::MS_RDONLY | MsFlags::MS_REMOUNT,
1118                    data: None,
1119                },
1120            ];
1121            let got = &m
1122                .syscall
1123                .as_any()
1124                .downcast_ref::<TestHelperSyscall>()
1125                .unwrap()
1126                .get_mount_args();
1127            assert_eq!(want, *got);
1128            assert_eq!(got.len(), 2);
1129        }
1130        {
1131            // Socket file bind-mount
1132            // https://github.com/youki-dev/youki/issues/3483
1133            let m = Mount::new();
1134            let mount = &SpecMountBuilder::default()
1135                .destination(PathBuf::from("/tmp.sock"))
1136                .typ("bind")
1137                .source(tmp_dir.path().join("tmp.sock"))
1138                .build()?;
1139            let mount_option_config = parse_mount(mount)?;
1140            UnixListener::bind(tmp_dir.path().join("tmp.sock"))?;
1141
1142            assert!(
1143                m.mount_into_container(mount, tmp_dir.path(), &mount_option_config, None)
1144                    .is_ok()
1145            );
1146
1147            let want = vec![MountArgs {
1148                source: Some(tmp_dir.path().join("tmp.sock")),
1149                target: tmp_dir.path().join("tmp.sock"),
1150                fstype: Some("bind".to_string()),
1151                flags: MsFlags::empty(),
1152                data: Some("".to_string()),
1153            }];
1154            let got = &m
1155                .syscall
1156                .as_any()
1157                .downcast_ref::<TestHelperSyscall>()
1158                .unwrap()
1159                .get_mount_args();
1160            assert_eq!(want, *got);
1161            assert_eq!(got.len(), 2);
1162        }
1163        {
1164            let m = Mount::new();
1165            let mount = &SpecMountBuilder::default()
1166                .destination(PathBuf::from("/tmp/retry"))
1167                .typ("tmpfs")
1168                .source(PathBuf::from("tmpfs"))
1169                .build()?;
1170            let mount_option_config = parse_mount(mount)?;
1171
1172            let syscall = m
1173                .syscall
1174                .as_any()
1175                .downcast_ref::<TestHelperSyscall>()
1176                .unwrap();
1177            syscall.set_ret_err(ArgName::Mount, || {
1178                Err(crate::syscall::SyscallError::Nix(nix::errno::Errno::EINVAL))
1179            });
1180            syscall.set_ret_err_times(ArgName::Mount, 1);
1181
1182            assert!(
1183                m.mount_into_container(mount, tmp_dir.path(), &mount_option_config, None)
1184                    .is_ok()
1185            );
1186            assert_eq!(syscall.get_mount_args().len(), 1);
1187        }
1188        {
1189            let m = Mount::new();
1190            let mount = &SpecMountBuilder::default()
1191                .destination(PathBuf::from("/tmp/retry"))
1192                .typ("tmpfs")
1193                .source(PathBuf::from("tmpfs"))
1194                .build()?;
1195            let mount_option_config = parse_mount(mount)?;
1196
1197            let syscall = m
1198                .syscall
1199                .as_any()
1200                .downcast_ref::<TestHelperSyscall>()
1201                .unwrap();
1202            syscall.set_ret_err(ArgName::Mount, || {
1203                Err(crate::syscall::SyscallError::Nix(nix::errno::Errno::EINVAL))
1204            });
1205            syscall.set_ret_err_times(ArgName::Mount, 2);
1206
1207            assert!(
1208                m.mount_into_container(mount, tmp_dir.path(), &mount_option_config, None)
1209                    .is_err()
1210            );
1211            assert_eq!(syscall.get_mount_args().len(), 0);
1212        }
1213        {
1214            let m = Mount::new();
1215            let mount = &SpecMountBuilder::default()
1216                .destination(PathBuf::from("/tmp/retry"))
1217                .typ("tmpfs")
1218                .source(PathBuf::from("tmpfs"))
1219                .build()?;
1220            let mount_option_config = parse_mount(mount)?;
1221
1222            let syscall = m
1223                .syscall
1224                .as_any()
1225                .downcast_ref::<TestHelperSyscall>()
1226                .unwrap();
1227            syscall.set_ret_err(ArgName::Mount, || {
1228                Err(crate::syscall::SyscallError::Nix(nix::errno::Errno::EBUSY))
1229            });
1230            syscall.set_ret_err_times(ArgName::Mount, MAX_EBUSY_MOUNT_ATTEMPTS as usize - 1);
1231
1232            assert!(
1233                m.mount_into_container(mount, tmp_dir.path(), &mount_option_config, None)
1234                    .is_ok()
1235            );
1236            assert_eq!(syscall.get_mount_args().len(), 1);
1237        }
1238        {
1239            let m = Mount::new();
1240            let mount = &SpecMountBuilder::default()
1241                .destination(PathBuf::from("/tmp/retry"))
1242                .typ("tmpfs")
1243                .source(PathBuf::from("tmpfs"))
1244                .build()?;
1245            let mount_option_config = parse_mount(mount)?;
1246
1247            let syscall = m
1248                .syscall
1249                .as_any()
1250                .downcast_ref::<TestHelperSyscall>()
1251                .unwrap();
1252            syscall.set_ret_err(ArgName::Mount, || {
1253                Err(crate::syscall::SyscallError::Nix(nix::errno::Errno::EBUSY))
1254            });
1255            syscall.set_ret_err_times(ArgName::Mount, MAX_EBUSY_MOUNT_ATTEMPTS as usize);
1256
1257            assert!(
1258                m.mount_into_container(mount, tmp_dir.path(), &mount_option_config, None)
1259                    .is_err()
1260            );
1261            assert_eq!(syscall.get_mount_args().len(), 0);
1262        }
1263
1264        Ok(())
1265    }
1266
1267    struct FakeMountInfo {
1268        entries: Vec<MountInfo>,
1269    }
1270    impl MountInfoProvider for FakeMountInfo {
1271        fn mountinfo(&self) -> std::result::Result<Vec<MountInfo>, MountError> {
1272            std::result::Result::Ok(self.entries.clone())
1273        }
1274    }
1275
1276    #[test]
1277    fn test_make_parent_mount_private() -> Result<()> {
1278        let tmp_dir = PathBuf::from_str("/tmp/mydir")?;
1279
1280        let parent = tmp_dir.as_path().parent().unwrap().to_path_buf();
1281        let fake = FakeMountInfo {
1282            entries: vec![MountInfo {
1283                mnt_id: 1,
1284                pid: 0,
1285                majmin: "".to_string(),
1286                root: "/".to_string(),
1287                mount_point: parent.clone(),
1288                mount_options: Default::default(),
1289                opt_fields: vec![MountOptFields::Shared(1)],
1290                fs_type: "tmpfs".to_string(),
1291                mount_source: None,
1292                super_options: Default::default(),
1293            }],
1294        };
1295
1296        let m = Mount::new().with_mountinfo_provider(fake);
1297        m.make_parent_mount_private(tmp_dir.as_path())?;
1298
1299        let set = m
1300            .syscall
1301            .as_any()
1302            .downcast_ref::<TestHelperSyscall>()
1303            .unwrap()
1304            .get_mount_args();
1305
1306        assert_eq!(set.len(), 1);
1307
1308        let got = &set[0];
1309        assert_eq!(got.source, None);
1310        assert_eq!(got.fstype, None);
1311        assert_eq!(got.flags, MsFlags::MS_PRIVATE);
1312        assert_eq!(got.data, None);
1313
1314        assert_eq!(got.target, parent);
1315
1316        Ok(())
1317    }
1318
1319    #[test]
1320    fn test_not_make_parent_mount_private_if_already_private() -> Result<()> {
1321        let tmp_dir = PathBuf::from_str("/tmp/mydir")?;
1322
1323        let parent = tmp_dir.as_path().parent().unwrap().to_path_buf();
1324        let fake = FakeMountInfo {
1325            entries: vec![MountInfo {
1326                mnt_id: 1,
1327                pid: 0,
1328                majmin: "".to_string(),
1329                root: "/".to_string(),
1330                mount_point: parent.clone(),
1331                mount_options: Default::default(),
1332                opt_fields: vec![],
1333                fs_type: "tmpfs".to_string(),
1334                mount_source: None,
1335                super_options: Default::default(),
1336            }],
1337        };
1338
1339        let m = Mount::new().with_mountinfo_provider(fake);
1340        m.make_parent_mount_private(tmp_dir.as_path())?;
1341
1342        let set = m
1343            .syscall
1344            .as_any()
1345            .downcast_ref::<TestHelperSyscall>()
1346            .unwrap()
1347            .get_mount_args();
1348
1349        assert_eq!(set.len(), 0);
1350
1351        Ok(())
1352    }
1353
1354    #[test]
1355    #[cfg(feature = "v1")]
1356    #[ignore] // TODO: fix fd-based test
1357    fn test_namespaced_subsystem_success() -> Result<()> {
1358        let tmp = tempfile::tempdir().unwrap();
1359        let container_cgroup = Path::new("/container_cgroup");
1360
1361        let mounter = Mount::new();
1362
1363        let spec_cgroup_mount = SpecMountBuilder::default()
1364            .destination(container_cgroup)
1365            .source("cgroup")
1366            .typ("cgroup")
1367            .build()
1368            .context("failed to build cgroup mount")?;
1369
1370        let mount_opts = MountOptions {
1371            root: tmp.path(),
1372            label: None,
1373            cgroup_ns: true,
1374        };
1375
1376        let subsystem_name = "cpu";
1377
1378        mounter
1379            .setup_namespaced_subsystem(&spec_cgroup_mount, &mount_opts, subsystem_name, false)
1380            .context("failed to setup namespaced subsystem")?;
1381
1382        let expected = MountArgs {
1383            source: Some(PathBuf::from("cgroup")),
1384            target: tmp
1385                .path()
1386                .join_safely(container_cgroup)?
1387                .join(subsystem_name),
1388            fstype: Some("cgroup".to_owned()),
1389            flags: MsFlags::MS_NOEXEC | MsFlags::MS_NOSUID | MsFlags::MS_NODEV,
1390            data: Some("cpu".to_owned()),
1391        };
1392
1393        let got = mounter
1394            .syscall
1395            .as_any()
1396            .downcast_ref::<TestHelperSyscall>()
1397            .unwrap()
1398            .get_mount_args();
1399
1400        assert_eq!(got.len(), 1);
1401        assert_eq!(expected, got[0]);
1402
1403        Ok(())
1404    }
1405
1406    #[test]
1407    #[cfg(feature = "v1")]
1408    #[ignore] // TODO: fix fd-based test
1409    fn test_emulated_subsystem_success() -> Result<()> {
1410        // arrange
1411        let tmp = tempfile::tempdir().unwrap();
1412        let host_cgroup_mount = tmp.path().join("host_cgroup");
1413        let host_cgroup = host_cgroup_mount.join("cpu/container1");
1414        fs::create_dir_all(&host_cgroup)?;
1415
1416        let container_cgroup = Path::new("/container_cgroup");
1417        let mounter = Mount::new();
1418
1419        let spec_cgroup_mount = SpecMountBuilder::default()
1420            .destination(container_cgroup)
1421            .source("cgroup")
1422            .typ("cgroup")
1423            .build()
1424            .context("failed to build cgroup mount")?;
1425
1426        let mount_opts = MountOptions {
1427            root: tmp.path(),
1428            label: None,
1429            cgroup_ns: false,
1430        };
1431
1432        let subsystem_name = "cpu";
1433        let mut process_cgroups = HashMap::new();
1434        process_cgroups.insert("cpu".to_owned(), "container1".to_owned());
1435
1436        // act
1437        mounter
1438            .setup_emulated_subsystem(
1439                &spec_cgroup_mount,
1440                &mount_opts,
1441                subsystem_name,
1442                false,
1443                &host_cgroup_mount.join(subsystem_name),
1444                &process_cgroups,
1445            )
1446            .context("failed to setup emulated subsystem")?;
1447
1448        // assert
1449        let expected = MountArgs {
1450            source: Some(host_cgroup),
1451            target: tmp
1452                .path()
1453                .join_safely(container_cgroup)?
1454                .join(subsystem_name),
1455            fstype: Some("bind".to_owned()),
1456            flags: MsFlags::MS_BIND | MsFlags::MS_REC,
1457            data: Some("".to_owned()),
1458        };
1459
1460        let got = mounter
1461            .syscall
1462            .as_any()
1463            .downcast_ref::<TestHelperSyscall>()
1464            .unwrap()
1465            .get_mount_args();
1466
1467        assert_eq!(got.len(), 1);
1468        assert_eq!(expected, got[0]);
1469
1470        Ok(())
1471    }
1472
1473    #[test]
1474    #[cfg(feature = "v1")]
1475    #[ignore] // TODO: fix fd-based test
1476    fn test_mount_cgroup_v1() -> Result<()> {
1477        // arrange
1478        let tmp = tempfile::tempdir()?;
1479        let container_cgroup = PathBuf::from("/sys/fs/cgroup");
1480
1481        let spec_cgroup_mount = SpecMountBuilder::default()
1482            .destination(&container_cgroup)
1483            .source("cgroup")
1484            .typ("cgroup")
1485            .build()
1486            .context("failed to build cgroup mount")?;
1487
1488        let mount_opts = MountOptions {
1489            root: tmp.path(),
1490            label: None,
1491            cgroup_ns: true,
1492        };
1493
1494        let mounter = Mount::new();
1495
1496        // act
1497        mounter
1498            .mount_cgroup_v1(&spec_cgroup_mount, &mount_opts)
1499            .context("failed to mount cgroup v1")?;
1500
1501        // assert
1502        let mut got = mounter
1503            .syscall
1504            .as_any()
1505            .downcast_ref::<TestHelperSyscall>()
1506            .unwrap()
1507            .get_mount_args()
1508            .into_iter();
1509
1510        let host_mounts = libcgroups::v1::util::list_subsystem_mount_points()?;
1511        assert_eq!(got.len(), host_mounts.len() + 1);
1512
1513        let expected = MountArgs {
1514            source: Some(PathBuf::from("tmpfs".to_owned())),
1515            target: tmp.path().join_safely(&container_cgroup)?,
1516            fstype: Some("tmpfs".to_owned()),
1517            flags: MsFlags::MS_NOEXEC | MsFlags::MS_NOSUID | MsFlags::MS_NODEV,
1518            data: Some("mode=755".to_owned()),
1519        };
1520        assert_eq!(expected, got.next().unwrap());
1521
1522        for (host_mount, act) in host_mounts.iter().zip(got) {
1523            let subsystem_name = host_mount.file_name().and_then(|f| f.to_str()).unwrap();
1524            let expected = MountArgs {
1525                source: Some(PathBuf::from("cgroup".to_owned())),
1526                target: tmp
1527                    .path()
1528                    .join_safely(&container_cgroup)?
1529                    .join(subsystem_name),
1530                fstype: Some("cgroup".to_owned()),
1531                flags: MsFlags::MS_NOEXEC | MsFlags::MS_NOSUID | MsFlags::MS_NODEV,
1532                data: Some(
1533                    if subsystem_name == "systemd" {
1534                        format!("name={subsystem_name}")
1535                    } else {
1536                        subsystem_name.to_string()
1537                    }
1538                    .to_owned(),
1539                ),
1540            };
1541            assert_eq!(expected, act);
1542        }
1543
1544        Ok(())
1545    }
1546
1547    #[test]
1548    #[cfg(feature = "v2")]
1549    #[ignore] // TODO: fix fd-based test
1550    fn test_mount_cgroup_v2() -> Result<()> {
1551        // arrange
1552        let tmp = tempfile::tempdir().unwrap();
1553        let container_cgroup = PathBuf::from("/sys/fs/cgroup");
1554
1555        let spec_cgroup_mount = SpecMountBuilder::default()
1556            .destination(&container_cgroup)
1557            .source("cgroup")
1558            .typ("cgroup")
1559            .build()
1560            .context("failed to build cgroup mount")?;
1561
1562        let mount_opts = MountOptions {
1563            root: tmp.path(),
1564            label: None,
1565            cgroup_ns: true,
1566        };
1567
1568        let mounter = Mount::new();
1569        let flags = MsFlags::MS_NOEXEC | MsFlags::MS_NOSUID | MsFlags::MS_NODEV;
1570
1571        // act
1572        let mount_option_config = MountOptionConfig {
1573            flags,
1574            data: vec![],
1575            rec_attr: None,
1576        };
1577        mounter
1578            .mount_cgroup_v2(&spec_cgroup_mount, &mount_opts, &mount_option_config)
1579            .context("failed to mount cgroup v2")?;
1580
1581        // assert
1582        let expected = MountArgs {
1583            source: Some(PathBuf::from("cgroup".to_owned())),
1584            target: tmp.path().join_safely(container_cgroup)?,
1585            fstype: Some("cgroup2".to_owned()),
1586            flags: MsFlags::MS_NOEXEC | MsFlags::MS_NOSUID | MsFlags::MS_NODEV,
1587            data: Some("".to_owned()),
1588        };
1589
1590        let got = mounter
1591            .syscall
1592            .as_any()
1593            .downcast_ref::<TestHelperSyscall>()
1594            .unwrap()
1595            .get_mount_args();
1596
1597        assert_eq!(got.len(), 1);
1598        assert_eq!(expected, got[0]);
1599
1600        Ok(())
1601    }
1602
1603    #[test]
1604    fn test_find_parent_mount() -> anyhow::Result<()> {
1605        let mount_infos = vec![
1606            MountInfo {
1607                mnt_id: 11,
1608                pid: 10,
1609                majmin: "".to_string(),
1610                root: "/".to_string(),
1611                mount_point: PathBuf::from("/"),
1612                mount_options: Default::default(),
1613                opt_fields: vec![],
1614                fs_type: "ext4".to_string(),
1615                mount_source: Some("/dev/sda1".to_string()),
1616                super_options: Default::default(),
1617            },
1618            MountInfo {
1619                mnt_id: 12,
1620                pid: 11,
1621                majmin: "".to_string(),
1622                root: "/".to_string(),
1623                mount_point: PathBuf::from("/proc"),
1624                mount_options: Default::default(),
1625                opt_fields: vec![],
1626                fs_type: "proc".to_string(),
1627                mount_source: Some("proc".to_string()),
1628                super_options: Default::default(),
1629            },
1630        ];
1631
1632        let res = find_parent_mount(Path::new("/path/to/rootfs"), mount_infos)
1633            .context("failed to get parent mount")?;
1634        assert_eq!(res.mnt_id, 11);
1635        Ok(())
1636    }
1637
1638    #[test]
1639    fn test_find_parent_mount_with_empty_mount_infos() {
1640        let mount_infos = vec![];
1641        let res = find_parent_mount(Path::new("/path/to/rootfs"), mount_infos);
1642        assert!(res.is_err());
1643    }
1644
1645    #[test]
1646    fn test_check_proc_mount_proc_ok() -> Result<()> {
1647        let rootfs = tempfile::tempdir()?;
1648        let mounter = Mount::new();
1649
1650        let mount = SpecMountBuilder::default()
1651            .destination(PathBuf::from("/proc"))
1652            .typ("proc".to_string())
1653            .source(PathBuf::from("proc"))
1654            .build()?;
1655
1656        assert!(mounter.check_proc_mount(rootfs.path(), &mount).is_ok());
1657        Ok(())
1658    }
1659
1660    #[test]
1661    fn test_check_proc_mount_allowed_subpath() -> Result<()> {
1662        let rootfs = tempfile::tempdir()?;
1663        let uptime = rootfs.path().join("proc/uptime");
1664        std::fs::create_dir_all(uptime.parent().unwrap())?;
1665
1666        let mounter = Mount::new();
1667        let mount = SpecMountBuilder::default()
1668            .destination(PathBuf::from("/proc/uptime"))
1669            .typ("bind".to_string())
1670            .source(uptime)
1671            .build()?;
1672
1673        assert!(mounter.check_proc_mount(rootfs.path(), &mount).is_ok());
1674        Ok(())
1675    }
1676
1677    #[test]
1678    fn test_check_proc_mount_denied_subpath() -> Result<()> {
1679        let rootfs = tempfile::tempdir()?;
1680        let custom = rootfs.path().join("proc/custom");
1681        std::fs::create_dir_all(custom.parent().unwrap())?;
1682
1683        let mounter = Mount::new();
1684        let mount = SpecMountBuilder::default()
1685            .destination(PathBuf::from("/proc/custom"))
1686            .typ("bind".to_string())
1687            .source(custom)
1688            .build()?;
1689
1690        assert!(mounter.check_proc_mount(rootfs.path(), &mount).is_err());
1691        Ok(())
1692    }
1693
1694    #[test]
1695    fn setup_mount_proc_fails_if_destination_is_symlink() -> Result<()> {
1696        let tmp = tempfile::tempdir()?;
1697        let rootfs = tmp.path();
1698
1699        let symlink_path = rootfs.join("symlink");
1700        fs::create_dir_all(&symlink_path)?;
1701        let proc_path = rootfs.join("proc");
1702
1703        symlink(&symlink_path, &proc_path)?;
1704
1705        let mount = SpecMountBuilder::default()
1706            .destination(PathBuf::from("/proc"))
1707            .typ("proc")
1708            .source(proc_path)
1709            .build()?;
1710
1711        let options = MountOptions {
1712            root: rootfs,
1713            label: None,
1714            cgroup_ns: true,
1715        };
1716
1717        let m = Mount::new();
1718
1719        let res = m.setup_mount(&mount, &options);
1720
1721        // proc destination symlink should be rejected
1722        assert!(res.is_err());
1723        let err = format!("{:?}", res.err().unwrap());
1724        assert!(err.contains("must be mounted on ordinary directory"));
1725
1726        Ok(())
1727    }
1728
1729    #[test]
1730    fn setup_mount_sys_fails_if_destination_is_symlink() -> Result<()> {
1731        let tmp = tempfile::tempdir()?;
1732        let rootfs = tmp.path();
1733
1734        let symlink_path = rootfs.join("symlink");
1735        fs::create_dir_all(&symlink_path)?;
1736        let sys_path = rootfs.join("sys");
1737
1738        symlink(&symlink_path, &sys_path)?;
1739
1740        let mount = SpecMountBuilder::default()
1741            .destination(PathBuf::from("/sys"))
1742            .typ("sysfs")
1743            .source(sys_path)
1744            .build()?;
1745
1746        let options = MountOptions {
1747            root: rootfs,
1748            label: None,
1749            cgroup_ns: true,
1750        };
1751
1752        let m = Mount::new();
1753
1754        let res = m.setup_mount(&mount, &options);
1755
1756        // sys destination symlink should be rejected
1757        assert!(res.is_err());
1758        let err = format!("{:?}", res.err().unwrap());
1759        assert!(err.contains("must be mounted on ordinary directory"));
1760
1761        Ok(())
1762    }
1763}