Skip to main content

nucleus/filesystem/
mount.rs

1use crate::error::{NucleusError, Result};
2use nix::fcntl::{open, OFlag};
3use nix::mount::{mount, MsFlags};
4use nix::sys::stat::{fstat, makedev, mknod, Mode, SFlag};
5use nix::unistd::chroot;
6use std::fs::OpenOptions;
7use std::io::Read;
8use std::os::fd::AsRawFd;
9use std::os::unix::fs::OpenOptionsExt;
10use std::path::{Component, Path, PathBuf};
11use tracing::{debug, info, warn};
12
/// Expected mount flags for audit verification.
///
/// One entry describes a single mount point that `audit_mounts` looks up in
/// the parsed `/proc/self/mountinfo` table.
struct ExpectedMount {
    /// Absolute mount point, compared verbatim against the (decoded)
    /// mount-point column of a mountinfo line.
    path: &'static str,
    /// Option names (e.g. `"ro"`, `"nosuid"`) that must all appear in the
    /// per-mount options column of the matching mountinfo line.
    required_flags: &'static [&'static str],
    /// If true, the mount *must* exist in production mode. A missing critical
    /// mount (e.g. /proc) is treated as a violation rather than silently skipped.
    critical: bool,
}
21
/// Known mount paths and the flags they must carry in production mode.
///
/// The rootfs entries (`/bin`, `/usr`, `/lib`, `/lib64`, `/etc`, `/nix`,
/// `/sbin`) mirror the read-only + nosuid + nodev remount performed by
/// `bind_mount_rootfs`; the `/proc` entry mirrors the flags passed by
/// `mount_procfs`. Entries marked `critical: false` are only checked when the
/// mount point is actually present in the mount table.
const PRODUCTION_MOUNT_EXPECTATIONS: &[ExpectedMount] = &[
    ExpectedMount {
        path: "/bin",
        required_flags: &["ro", "nosuid", "nodev"],
        critical: true,
    },
    ExpectedMount {
        path: "/usr",
        required_flags: &["ro", "nosuid", "nodev"],
        critical: true,
    },
    ExpectedMount {
        path: "/lib",
        required_flags: &["ro", "nosuid", "nodev"],
        critical: false, // not all rootfs layouts have /lib
    },
    ExpectedMount {
        path: "/lib64",
        required_flags: &["ro", "nosuid", "nodev"],
        critical: false, // not all rootfs layouts have /lib64
    },
    ExpectedMount {
        path: "/etc",
        required_flags: &["ro", "nosuid", "nodev"],
        critical: true,
    },
    ExpectedMount {
        path: "/nix",
        required_flags: &["ro", "nosuid", "nodev"],
        critical: false, // only present on NixOS-based rootfs
    },
    ExpectedMount {
        path: "/sbin",
        required_flags: &["ro", "nosuid", "nodev"],
        critical: false, // not all rootfs layouts have /sbin
    },
    ExpectedMount {
        path: "/proc",
        required_flags: &["nosuid", "nodev", "noexec"],
        critical: true,
    },
    // NOTE(review): the code that mounts /run/secrets is not in this part of
    // the file; presumably it uses nosuid,nodev,noexec -- confirm.
    ExpectedMount {
        path: "/run/secrets",
        required_flags: &["nosuid", "nodev", "noexec"],
        critical: false, // only present when secrets are configured
    },
];
70
71/// Normalize an absolute container destination path and reject traversal.
72///
73/// Returns a normalized absolute path containing only `RootDir` and `Normal`
74/// components. `.` segments are ignored; `..` and relative paths are rejected.
75pub fn normalize_container_destination(dest: &Path) -> Result<PathBuf> {
76    if !dest.is_absolute() {
77        return Err(NucleusError::ConfigError(format!(
78            "Container destination must be absolute: {:?}",
79            dest
80        )));
81    }
82
83    let mut normalized = PathBuf::from("/");
84    let mut saw_component = false;
85
86    for component in dest.components() {
87        match component {
88            Component::RootDir => {}
89            Component::CurDir => {}
90            Component::Normal(part) => {
91                normalized.push(part);
92                saw_component = true;
93            }
94            Component::ParentDir => {
95                return Err(NucleusError::ConfigError(format!(
96                    "Container destination must not contain parent traversal: {:?}",
97                    dest
98                )));
99            }
100            Component::Prefix(_) => {
101                return Err(NucleusError::ConfigError(format!(
102                    "Unsupported container destination prefix: {:?}",
103                    dest
104                )));
105            }
106        }
107    }
108
109    if !saw_component {
110        return Err(NucleusError::ConfigError(format!(
111            "Container destination must not be the root directory: {:?}",
112            dest
113        )));
114    }
115
116    Ok(normalized)
117}
118
119/// Resolve a validated container destination under a host-side root directory.
120pub fn resolve_container_destination(root: &Path, dest: &Path) -> Result<PathBuf> {
121    let normalized = normalize_container_destination(dest)?;
122    let relative = normalized.strip_prefix("/").map_err(|_| {
123        NucleusError::ConfigError(format!(
124            "Container destination is not absolute after normalization: {:?}",
125            normalized
126        ))
127    })?;
128    Ok(root.join(relative))
129}
130
131pub(crate) fn read_regular_file_nofollow(path: &Path) -> Result<Vec<u8>> {
132    let mut file = OpenOptions::new()
133        .read(true)
134        .custom_flags(libc::O_NOFOLLOW | libc::O_CLOEXEC)
135        .open(path)
136        .map_err(|e| {
137            NucleusError::FilesystemError(format!(
138                "Failed to open file {:?} with O_NOFOLLOW: {}",
139                path, e
140            ))
141        })?;
142
143    let metadata = file.metadata().map_err(|e| {
144        NucleusError::FilesystemError(format!("Failed to stat file {:?}: {}", path, e))
145    })?;
146    if !metadata.is_file() {
147        return Err(NucleusError::FilesystemError(format!(
148            "Expected regular file for {:?}, found non-file source",
149            path
150        )));
151    }
152
153    let mut content = Vec::new();
154    file.read_to_end(&mut content).map_err(|e| {
155        NucleusError::FilesystemError(format!("Failed to read file {:?}: {}", path, e))
156    })?;
157    Ok(content)
158}
159
/// Decode the octal escapes used in `/proc/self/mountinfo` fields.
///
/// Recognizes `\040` (space), `\011` (tab), `\012` (newline) and `\134`
/// (backslash). Any other backslash sequence, including one cut short by the
/// end of the input, is copied through unchanged.
fn decode_mountinfo_field(field: &str) -> String {
    let mut out = String::with_capacity(field.len());
    let mut remaining = field.chars();

    while let Some(current) = remaining.next() {
        if current != '\\' {
            out.push(current);
            continue;
        }

        // Consume up to three characters following the backslash.
        let escape: String = remaining.by_ref().take(3).collect();
        let replacement = match escape.as_str() {
            "040" => Some(' '),
            "011" => Some('\t'),
            "012" => Some('\n'),
            "134" => Some('\\'),
            _ => None,
        };

        match replacement {
            Some(ch) => out.push(ch),
            None => {
                // Unknown escape: emit it exactly as it appeared.
                out.push('\\');
                out.push_str(&escape);
            }
        }
    }

    out
}
184
185fn parse_mountinfo_line(line: &str) -> Option<(String, std::collections::HashSet<String>)> {
186    let (left, _) = line.split_once(" - ")?;
187    let fields: Vec<&str> = left.split_whitespace().collect();
188    if fields.len() < 6 {
189        return None;
190    }
191
192    let mount_point = decode_mountinfo_field(fields[4]);
193    let options = fields[5]
194        .split(',')
195        .map(str::trim)
196        .filter(|opt| !opt.is_empty())
197        .map(str::to_string)
198        .collect();
199
200    Some((mount_point, options))
201}
202
203/// Audit all mounts in the container's mount namespace.
204///
205/// Reads `/proc/self/mountinfo` and verifies that each known mount point carries
206/// its expected per-mount flags. In production mode, any missing flag is fatal.
207/// Returns Ok(()) if all checks pass, or a list of violations.
208pub fn audit_mounts(production_mode: bool) -> Result<()> {
209    let mounts_content = std::fs::read_to_string("/proc/self/mountinfo").map_err(|e| {
210        NucleusError::FilesystemError(format!("Failed to read /proc/self/mountinfo: {}", e))
211    })?;
212    let mount_table: std::collections::HashMap<String, std::collections::HashSet<String>> =
213        mounts_content
214            .lines()
215            .filter_map(parse_mountinfo_line)
216            .collect();
217
218    let mut violations = Vec::new();
219
220    for expectation in PRODUCTION_MOUNT_EXPECTATIONS {
221        if let Some(options) = mount_table.get(expectation.path) {
222            for &flag in expectation.required_flags {
223                if !options.contains(flag) {
224                    let rendered = options
225                        .iter()
226                        .map(String::as_str)
227                        .collect::<Vec<_>>()
228                        .join(",");
229                    violations.push(format!(
230                        "Mount {} missing required flag '{}' (has: {})",
231                        expectation.path, flag, rendered
232                    ));
233                }
234            }
235        } else if expectation.critical && production_mode {
236            violations.push(format!(
237                "Critical mount {} is missing from the mount namespace",
238                expectation.path
239            ));
240        }
241    }
242
243    if violations.is_empty() {
244        info!("Mount audit passed: all expected flags verified");
245        Ok(())
246    } else if production_mode {
247        Err(NucleusError::FilesystemError(format!(
248            "Mount audit failed in production mode:\n  {}",
249            violations.join("\n  ")
250        )))
251    } else {
252        for v in &violations {
253            warn!("Mount audit: {}", v);
254        }
255        Ok(())
256    }
257}
258
259/// Create minimal filesystem structure in the new root
260pub fn create_minimal_fs(root: &Path) -> Result<()> {
261    info!("Creating minimal filesystem structure at {:?}", root);
262
263    // Create essential directories
264    let dirs = vec![
265        "dev",
266        "proc",
267        "sys",
268        "tmp",
269        "bin",
270        "sbin",
271        "usr",
272        "lib",
273        "lib64",
274        "etc",
275        "nix",
276        "nix/store",
277        "run",
278        "context",
279    ];
280
281    for dir in dirs {
282        let path = root.join(dir);
283        std::fs::create_dir_all(&path).map_err(|e| {
284            NucleusError::FilesystemError(format!("Failed to create directory {:?}: {}", path, e))
285        })?;
286    }
287
288    info!("Created minimal filesystem structure");
289
290    Ok(())
291}
292
293/// Create essential device nodes in /dev
294///
295/// In rootless mode, device node creation will fail gracefully
296pub fn create_dev_nodes(dev_path: &Path, include_tty: bool) -> Result<()> {
297    info!("Creating device nodes at {:?}", dev_path);
298
299    // Device nodes: (name, type, major, minor)
300    let mut devices = vec![
301        ("null", SFlag::S_IFCHR, 1, 3),
302        ("zero", SFlag::S_IFCHR, 1, 5),
303        ("full", SFlag::S_IFCHR, 1, 7),
304        ("random", SFlag::S_IFCHR, 1, 8),
305        ("urandom", SFlag::S_IFCHR, 1, 9),
306    ];
307    if include_tty {
308        devices.push(("tty", SFlag::S_IFCHR, 5, 0));
309    }
310
311    let mut created_count = 0;
312    let mut failed_count = 0;
313
314    for (name, dev_type, major, minor) in devices {
315        let path = dev_path.join(name);
316        let mode = Mode::from_bits_truncate(0o660);
317        let dev = makedev(major, minor);
318
319        match mknod(&path, dev_type, mode, dev) {
320            Ok(_) => {
321                info!("Created device node: {:?}", path);
322                created_count += 1;
323            }
324            Err(e) => {
325                // In rootless mode, mknod fails - this is expected
326                warn!(
327                    "Failed to create device node {:?}: {} (this is normal in rootless mode)",
328                    path, e
329                );
330                failed_count += 1;
331            }
332        }
333    }
334
335    if created_count > 0 {
336        info!("Successfully created {} device nodes", created_count);
337    }
338    if failed_count > 0 {
339        info!("Skipped {} device nodes (rootless mode)", failed_count);
340    }
341
342    Ok(())
343}
344
345/// Bind mount a pre-built rootfs (e.g. a Nix store closure) into the container.
346///
347/// Instead of exposing the full host /bin, /usr, /lib, /lib64, /nix, this mounts
348/// a minimal, purpose-built root filesystem. Suitable for production services.
349pub fn bind_mount_rootfs(root: &Path, rootfs_path: &Path) -> Result<()> {
350    info!(
351        "Bind mounting production rootfs {:?} into container {:?}",
352        rootfs_path, root
353    );
354
355    if std::fs::symlink_metadata(rootfs_path).is_err() {
356        return Err(NucleusError::FilesystemError(format!(
357            "Rootfs path does not exist: {:?}",
358            rootfs_path
359        )));
360    }
361
362    // Bind mount the rootfs contents into the container root.
363    // The rootfs is expected to contain a standard FHS layout (/bin, /lib, /etc, etc.)
364    // produced by a Nix buildEnv or similar.
365    let subdirs = ["bin", "sbin", "lib", "lib64", "usr", "etc", "nix"];
366
367    for subdir in &subdirs {
368        let source = rootfs_path.join(subdir);
369        if !source.exists() {
370            debug!("Rootfs subdir {} not present, skipping", subdir);
371            continue;
372        }
373
374        let target = root.join(subdir);
375        std::fs::create_dir_all(&target).map_err(|e| {
376            NucleusError::FilesystemError(format!(
377                "Failed to create mount point {:?}: {}",
378                target, e
379            ))
380        })?;
381
382        mount(
383            Some(&source),
384            &target,
385            None::<&str>,
386            MsFlags::MS_BIND | MsFlags::MS_REC,
387            None::<&str>,
388        )
389        .map_err(|e| {
390            NucleusError::FilesystemError(format!(
391                "Failed to bind mount rootfs {:?} -> {:?}: {}",
392                source, target, e
393            ))
394        })?;
395
396        // Remount read-only
397        mount(
398            None::<&str>,
399            &target,
400            None::<&str>,
401            MsFlags::MS_REMOUNT
402                | MsFlags::MS_BIND
403                | MsFlags::MS_RDONLY
404                | MsFlags::MS_REC
405                | MsFlags::MS_NOSUID
406                | MsFlags::MS_NODEV,
407            None::<&str>,
408        )
409        .map_err(|e| {
410            NucleusError::FilesystemError(format!(
411                "Failed to remount rootfs {:?} read-only: {}",
412                target, e
413            ))
414        })?;
415
416        info!("Mounted rootfs/{} read-only", subdir);
417    }
418
419    Ok(())
420}
421
422/// Bind mount essential host directories into container
423///
424/// This allows host binaries to be accessible inside the container.
425/// Used in agent mode. Production mode should use bind_mount_rootfs() instead.
426pub fn bind_mount_host_paths(root: &Path, best_effort: bool) -> Result<()> {
427    info!("Bind mounting host paths into container");
428
429    // Essential paths to bind mount (read-only)
430    let host_paths = vec![
431        "/bin", "/usr", "/lib", "/lib64", "/nix", // For NixOS
432    ];
433
434    for host_path in host_paths {
435        let host = Path::new(host_path);
436
437        // Only mount if the path exists on the host
438        if !host.exists() {
439            debug!("Skipping {} (not present on host)", host_path);
440            continue;
441        }
442
443        let container_path = root.join(host_path.trim_start_matches('/'));
444
445        // Create mount point
446        if let Err(e) = std::fs::create_dir_all(&container_path) {
447            if best_effort {
448                warn!("Failed to create mount point {:?}: {}", container_path, e);
449                continue;
450            }
451            return Err(NucleusError::FilesystemError(format!(
452                "Failed to create mount point {:?}: {}",
453                container_path, e
454            )));
455        }
456
457        // Attempt bind mount
458        // Note: Linux ignores MS_RDONLY on the initial bind mount call.
459        // A second remount is required to actually enforce read-only.
460        match mount(
461            Some(host),
462            &container_path,
463            None::<&str>,
464            MsFlags::MS_BIND | MsFlags::MS_REC,
465            None::<&str>,
466        ) {
467            Ok(_) => {
468                // Remount as read-only – required because MS_RDONLY is ignored on initial bind
469                mount(
470                    None::<&str>,
471                    &container_path,
472                    None::<&str>,
473                    MsFlags::MS_REMOUNT
474                        | MsFlags::MS_BIND
475                        | MsFlags::MS_RDONLY
476                        | MsFlags::MS_REC
477                        | MsFlags::MS_NOSUID
478                        | MsFlags::MS_NODEV,
479                    None::<&str>,
480                )
481                .map_err(|e| {
482                    NucleusError::FilesystemError(format!(
483                        "Failed to remount {} as read-only: {}",
484                        host_path, e
485                    ))
486                })?;
487                info!(
488                    "Bind mounted {} to {:?} (read-only)",
489                    host_path, container_path
490                );
491            }
492            Err(e) => {
493                if best_effort {
494                    warn!(
495                        "Failed to bind mount {}: {} (continuing anyway)",
496                        host_path, e
497                    );
498                } else {
499                    return Err(NucleusError::FilesystemError(format!(
500                        "Failed to bind mount {}: {}",
501                        host_path, e
502                    )));
503                }
504            }
505        }
506    }
507
508    Ok(())
509}
510
/// H7: Sensitive host paths that must not be bind-mounted into containers.
///
/// These are matched by exact path equality only: the listed files (and the
/// host root `/`) are rejected, but other entries in the same directories are
/// not affected by this list.
const DENIED_BIND_MOUNT_SOURCES_EXACT: &[&str] = &[
    "/",
    "/etc/shadow",
    "/etc/sudoers",
    "/etc/passwd",
    "/etc/gshadow",
];

/// Sensitive host subtrees that must not be exposed to a container at all.
///
/// A source is rejected when it equals one of these paths or lies anywhere
/// beneath it. Matching is done with `Path::starts_with`, i.e. on whole path
/// components, so `/dev` does not match an unrelated `/devices`.
const DENIED_BIND_MOUNT_SOURCE_PREFIXES: &[&str] = &["/proc", "/sys", "/dev", "/boot"];
522
523fn normalize_bind_mount_source_for_policy(source: &Path) -> Result<PathBuf> {
524    if !source.is_absolute() {
525        return Err(NucleusError::ConfigError(format!(
526            "Bind mount source must be absolute: {:?}",
527            source
528        )));
529    }
530
531    let mut normalized = PathBuf::from("/");
532
533    for component in source.components() {
534        match component {
535            Component::RootDir => {}
536            Component::CurDir => {}
537            Component::Normal(part) => normalized.push(part),
538            Component::ParentDir => {
539                normalized.pop();
540                if normalized.as_os_str().is_empty() {
541                    normalized.push("/");
542                }
543            }
544            Component::Prefix(_) => {
545                return Err(NucleusError::ConfigError(format!(
546                    "Unsupported bind mount source prefix: {:?}",
547                    source
548                )));
549            }
550        }
551    }
552
553    Ok(normalized)
554}
555
556fn reject_denied_bind_mount_source(source: &Path) -> Result<()> {
557    for denied in DENIED_BIND_MOUNT_SOURCES_EXACT {
558        if source == Path::new(denied) {
559            return Err(NucleusError::ConfigError(format!(
560                "Bind mount source '{}' is a sensitive host path and cannot be mounted into containers",
561                source.display()
562            )));
563        }
564    }
565
566    for denied in DENIED_BIND_MOUNT_SOURCE_PREFIXES {
567        let denied_path = Path::new(denied);
568        if source == denied_path || source.starts_with(denied_path) {
569            return Err(NucleusError::ConfigError(format!(
570                "Bind mount source '{}' is under sensitive host path '{}' and cannot be mounted into containers",
571                source.display(),
572                denied
573            )));
574        }
575    }
576
577    Ok(())
578}
579
580/// Validate that a bind mount source is not a sensitive host path or subtree.
581pub fn validate_bind_mount_source(source: &Path) -> Result<()> {
582    let normalized = normalize_bind_mount_source_for_policy(source)?;
583    reject_denied_bind_mount_source(&normalized)?;
584
585    let canonical = std::fs::canonicalize(source).map_err(|e| {
586        NucleusError::ConfigError(format!(
587            "Failed to resolve bind mount source {:?}: {}",
588            source, e
589        ))
590    })?;
591    reject_denied_bind_mount_source(&canonical)
592}
593
594/// Mount persistent bind volumes and ephemeral tmpfs volumes into the container root.
595pub fn mount_volumes(root: &Path, volumes: &[crate::container::VolumeMount]) -> Result<()> {
596    use crate::container::VolumeSource;
597
598    if volumes.is_empty() {
599        return Ok(());
600    }
601
602    info!("Mounting {} volume(s) into container", volumes.len());
603
604    for volume in volumes {
605        let dest = resolve_container_destination(root, &volume.dest)?;
606
607        match &volume.source {
608            VolumeSource::Bind { source } => {
609                // H7: Deny bind-mounting sensitive host paths
610                validate_bind_mount_source(source)?;
611
612                // Use symlink_metadata (lstat) instead of .exists() to avoid
613                // following symlinks in the existence check (O_NOFOLLOW semantics).
614                if std::fs::symlink_metadata(source).is_err() {
615                    return Err(NucleusError::FilesystemError(format!(
616                        "Volume source does not exist: {:?}",
617                        source
618                    )));
619                }
620
621                if let Some(parent) = dest.parent() {
622                    std::fs::create_dir_all(parent).map_err(|e| {
623                        NucleusError::FilesystemError(format!(
624                            "Failed to create volume mount parent {:?}: {}",
625                            parent, e
626                        ))
627                    })?;
628                }
629
630                let recursive = source.is_dir();
631                if source.is_file() {
632                    std::fs::write(&dest, "").map_err(|e| {
633                        NucleusError::FilesystemError(format!(
634                            "Failed to create volume mount point {:?}: {}",
635                            dest, e
636                        ))
637                    })?;
638                } else {
639                    std::fs::create_dir_all(&dest).map_err(|e| {
640                        NucleusError::FilesystemError(format!(
641                            "Failed to create volume mount dir {:?}: {}",
642                            dest, e
643                        ))
644                    })?;
645                }
646
647                let initial_flags = if recursive {
648                    MsFlags::MS_BIND | MsFlags::MS_REC
649                } else {
650                    MsFlags::MS_BIND
651                };
652                mount(
653                    Some(source.as_path()),
654                    &dest,
655                    None::<&str>,
656                    initial_flags,
657                    None::<&str>,
658                )
659                .map_err(|e| {
660                    NucleusError::FilesystemError(format!(
661                        "Failed to bind mount volume {:?} -> {:?}: {}",
662                        source, dest, e
663                    ))
664                })?;
665
666                let mut remount_flags =
667                    MsFlags::MS_REMOUNT | MsFlags::MS_BIND | MsFlags::MS_NOSUID | MsFlags::MS_NODEV;
668                if recursive {
669                    remount_flags |= MsFlags::MS_REC;
670                }
671                if volume.read_only {
672                    remount_flags |= MsFlags::MS_RDONLY;
673                }
674
675                mount(
676                    None::<&str>,
677                    &dest,
678                    None::<&str>,
679                    remount_flags,
680                    None::<&str>,
681                )
682                .map_err(|e| {
683                    NucleusError::FilesystemError(format!(
684                        "Failed to remount volume {:?} with final flags: {}",
685                        dest, e
686                    ))
687                })?;
688
689                info!(
690                    "Mounted bind volume {:?} -> {:?} ({})",
691                    source,
692                    volume.dest,
693                    if volume.read_only { "ro" } else { "rw" }
694                );
695            }
696            VolumeSource::Tmpfs { size } => {
697                std::fs::create_dir_all(&dest).map_err(|e| {
698                    NucleusError::FilesystemError(format!(
699                        "Failed to create tmpfs mount dir {:?}: {}",
700                        dest, e
701                    ))
702                })?;
703
704                // M8: Validate size parameter to prevent option injection.
705                // Only allow digits, optionally followed by K/M/G suffix.
706                if let Some(value) = size.as_ref() {
707                    let valid = value
708                        .chars()
709                        .all(|c| c.is_ascii_digit() || "kKmMgG".contains(c));
710                    if !valid || value.is_empty() {
711                        return Err(NucleusError::FilesystemError(format!(
712                            "Invalid tmpfs size value '{}': only digits with optional K/M/G suffix allowed",
713                            value
714                        )));
715                    }
716                }
717
718                // M7: Default to 64MB instead of half of physical RAM to
719                // prevent memory DoS from unbounded tmpfs volumes.
720                let mount_data = size
721                    .as_ref()
722                    .map(|value| format!("size={},mode=0700", value))
723                    .unwrap_or_else(|| "size=64M,mode=0700".to_string());
724
725                let mut flags = MsFlags::MS_NOSUID | MsFlags::MS_NODEV;
726                if volume.read_only {
727                    flags |= MsFlags::MS_RDONLY;
728                }
729                mount(
730                    Some("tmpfs"),
731                    &dest,
732                    Some("tmpfs"),
733                    flags,
734                    Some(mount_data.as_str()),
735                )
736                .map_err(|e| {
737                    NucleusError::FilesystemError(format!(
738                        "Failed to mount tmpfs volume at {:?}: {}",
739                        dest, e
740                    ))
741                })?;
742
743                info!(
744                    "Mounted tmpfs volume at {:?}{}{}",
745                    volume.dest,
746                    size.as_ref()
747                        .map(|value| format!(" (size={})", value))
748                        .unwrap_or_default(),
749                    if volume.read_only { " (ro)" } else { "" }
750                );
751            }
752        }
753    }
754
755    Ok(())
756}
757
/// Mount procfs at the given path
///
/// In rootless mode, procfs mounting should work due to user namespace capabilities.
/// When `hide_pids` is true, mounts with hidepid=2 so processes cannot enumerate
/// other PIDs (production hardening).
///
/// # Parameters
///
/// * `proc_path` - mount point for the new procfs instance.
/// * `best_effort` - when true, a failed mount is logged and `Ok(())` is
///   returned without procfs being mounted; a failed `hidepid=2` mount is
///   additionally retried once without `hidepid`.
/// * `read_only` - when true, a successful mount is followed by a read-only
///   remount.
/// * `hide_pids` - request the `hidepid=2` mount option.
///
/// # Errors
///
/// Returns `NucleusError::FilesystemError` if the mount fails and
/// `best_effort` is false, or if the read-only remount fails (that failure is
/// fatal even when `best_effort` is true).
pub fn mount_procfs(
    proc_path: &Path,
    best_effort: bool,
    read_only: bool,
    hide_pids: bool,
) -> Result<()> {
    info!(
        "Mounting procfs at {:?} (hidepid={})",
        proc_path,
        if hide_pids { "2" } else { "0" }
    );

    let mount_data: Option<&str> = if hide_pids { Some("hidepid=2") } else { None };
    // Tracks whether hidepid actually ended up applied; cleared if the
    // fallback mount without hidepid is the one that succeeds.
    let mut used_hidepid = hide_pids;

    // `mounted` is false only when a best-effort attempt failed completely.
    let mounted = match mount(
        Some("proc"),
        proc_path,
        Some("proc"),
        MsFlags::MS_NOSUID | MsFlags::MS_NODEV | MsFlags::MS_NOEXEC,
        mount_data,
    ) {
        Ok(_) => true,
        // Fallback path: only taken when hidepid was requested AND the caller
        // tolerates failures. With best_effort == false the generic arm below
        // turns the failure into an error instead of silently dropping hidepid.
        Err(e) if hide_pids && best_effort => {
            // Some kernels reject hidepid in user namespaces even though the
            // private PID namespace still prevents host PID enumeration.
            warn!(
                "Failed to mount procfs with hidepid=2: {} (retrying without hidepid)",
                e
            );
            match mount(
                Some("proc"),
                proc_path,
                Some("proc"),
                MsFlags::MS_NOSUID | MsFlags::MS_NODEV | MsFlags::MS_NOEXEC,
                None::<&str>,
            ) {
                Ok(_) => {
                    used_hidepid = false;
                    true
                }
                Err(e) => {
                    warn!("Failed to mount procfs: {} (continuing anyway)", e);
                    false
                }
            }
        }
        Err(e) => {
            if best_effort {
                warn!("Failed to mount procfs: {} (continuing anyway)", e);
                false
            } else {
                return Err(NucleusError::FilesystemError(format!(
                    "Failed to mount procfs: {}",
                    e
                )));
            }
        }
    };

    if mounted {
        if read_only {
            // Second step: remount the fresh procfs read-only, repeating the
            // nosuid/nodev/noexec flags used for the initial mount.
            mount(
                None::<&str>,
                proc_path,
                None::<&str>,
                MsFlags::MS_REMOUNT
                    | MsFlags::MS_RDONLY
                    | MsFlags::MS_NOSUID
                    | MsFlags::MS_NODEV
                    | MsFlags::MS_NOEXEC,
                None::<&str>,
            )
            .map_err(|e| {
                NucleusError::FilesystemError(format!("Failed to remount procfs read-only: {}", e))
            })?;
            // The log line records whether the hidepid request had to be dropped.
            if hide_pids && !used_hidepid {
                info!("Successfully mounted procfs without hidepid (read-only)");
            } else {
                info!("Successfully mounted procfs (read-only)");
            }
        } else if hide_pids && !used_hidepid {
            info!("Successfully mounted procfs without hidepid");
        } else {
            info!("Successfully mounted procfs");
        }
    }

    Ok(())
}
853
/// Paths to mask with /dev/null (files) – modelled on the OCI runtime spec
/// masked paths.
/// Exposed for testing; the canonical list of sensitive /proc entries that must
/// be hidden from container processes.
///
/// Entries are names relative to the procfs mount point; `mask_proc_paths`
/// joins each one onto its `proc_path` argument.
pub const PROC_NULL_MASKED: &[&str] = &[
    "kallsyms",
    "kcore",
    "sched_debug",
    "timer_list",
    "timer_stats",
    "keys",
    "latency_stats",
    "config.gz",
    "sysrq-trigger",
    "kpagecount",
    "kpageflags",
    "kpagecgroup",
];

/// Paths to remount read-only – modelled on the OCI runtime spec readonlyPaths.
///
/// Entries are names relative to the procfs mount point; `mask_proc_paths`
/// skips any that do not exist.
pub const PROC_READONLY_PATHS: &[&str] = &["bus", "fs", "irq", "sys"];

/// Paths to mask with empty tmpfs (directories).
///
/// Entries are names relative to the procfs mount point.
pub const PROC_TMPFS_MASKED: &[&str] = &["acpi", "scsi"];
877
878fn remount_proc_path_readonly(target: &Path) -> Result<()> {
879    mount(
880        Some(target),
881        target,
882        None::<&str>,
883        MsFlags::MS_BIND | MsFlags::MS_REC,
884        None::<&str>,
885    )
886    .map_err(|e| {
887        NucleusError::FilesystemError(format!(
888            "Failed to bind-mount {:?} onto itself for read-only remount: {}",
889            target, e
890        ))
891    })?;
892
893    mount(
894        None::<&str>,
895        target,
896        None::<&str>,
897        MsFlags::MS_REMOUNT
898            | MsFlags::MS_BIND
899            | MsFlags::MS_RDONLY
900            | MsFlags::MS_NOSUID
901            | MsFlags::MS_NODEV
902            | MsFlags::MS_NOEXEC,
903        None::<&str>,
904    )
905    .map_err(|e| {
906        NucleusError::FilesystemError(format!("Failed to remount {:?} read-only: {}", target, e))
907    })?;
908
909    Ok(())
910}
911
912/// Mask sensitive /proc paths by bind-mounting /dev/null or tmpfs over them
913///
914/// This reduces kernel information leakage from the container. Follows OCI runtime
915/// conventions for masked paths.
916///
917/// SEC-06: When `production` is true, failures to mask critical paths
918/// (kcore, kallsyms, sysrq-trigger) are fatal instead of warn-and-continue.
919pub fn mask_proc_paths(proc_path: &Path, production: bool) -> Result<()> {
920    info!("Masking sensitive /proc paths");
921
922    const CRITICAL_PROC_PATHS: &[&str] = &["kcore", "kallsyms", "sysrq-trigger"];
923
924    for name in PROC_READONLY_PATHS {
925        let target = proc_path.join(name);
926        if !target.exists() {
927            continue;
928        }
929        match remount_proc_path_readonly(&target) {
930            Ok(_) => debug!("Remounted /proc/{} read-only", name),
931            Err(e) => {
932                if production {
933                    return Err(NucleusError::FilesystemError(format!(
934                        "Failed to remount /proc/{} read-only in production mode: {}",
935                        name, e
936                    )));
937                }
938                warn!(
939                    "Failed to remount /proc/{} read-only: {} (continuing)",
940                    name, e
941                );
942            }
943        }
944    }
945
946    let dev_null = Path::new("/dev/null");
947
948    for name in PROC_NULL_MASKED {
949        let target = proc_path.join(name);
950        if !target.exists() {
951            continue;
952        }
953        match mount(
954            Some(dev_null),
955            &target,
956            None::<&str>,
957            MsFlags::MS_BIND,
958            None::<&str>,
959        ) {
960            Ok(_) => {
961                // Remount read-only: Linux ignores MS_RDONLY on the initial bind mount,
962                // so a separate MS_REMOUNT|MS_BIND|MS_RDONLY call is required.
963                if let Err(e) = mount(
964                    None::<&str>,
965                    &target,
966                    None::<&str>,
967                    MsFlags::MS_REMOUNT | MsFlags::MS_BIND | MsFlags::MS_RDONLY,
968                    None::<&str>,
969                ) {
970                    if production && CRITICAL_PROC_PATHS.contains(name) {
971                        return Err(NucleusError::FilesystemError(format!(
972                            "Failed to remount /proc/{} read-only in production mode: {}",
973                            name, e
974                        )));
975                    }
976                    warn!(
977                        "Failed to remount /proc/{} read-only: {} (continuing)",
978                        name, e
979                    );
980                }
981                debug!("Masked /proc/{} (read-only)", name);
982            }
983            Err(e) => {
984                if production && CRITICAL_PROC_PATHS.contains(name) {
985                    return Err(NucleusError::FilesystemError(format!(
986                        "Failed to mask critical /proc/{} in production mode: {}",
987                        name, e
988                    )));
989                }
990                warn!("Failed to mask /proc/{}: {} (continuing)", name, e);
991            }
992        }
993    }
994
995    for name in PROC_TMPFS_MASKED {
996        let target = proc_path.join(name);
997        if !target.exists() {
998            continue;
999        }
1000        match mount(
1001            Some("tmpfs"),
1002            &target,
1003            Some("tmpfs"),
1004            MsFlags::MS_RDONLY | MsFlags::MS_NOSUID | MsFlags::MS_NODEV | MsFlags::MS_NOEXEC,
1005            Some("size=0"),
1006        ) {
1007            Ok(_) => debug!("Masked /proc/{}", name),
1008            Err(e) => {
1009                if production {
1010                    return Err(NucleusError::FilesystemError(format!(
1011                        "Failed to mask /proc/{} in production mode: {}",
1012                        name, e
1013                    )));
1014                }
1015                warn!("Failed to mask /proc/{}: {} (continuing)", name, e);
1016            }
1017        }
1018    }
1019
1020    info!("Finished masking sensitive /proc paths");
1021    Ok(())
1022}
1023
1024/// Switch to new root filesystem using pivot_root or chroot
1025///
1026/// This implements the transition: populated -> pivoted
1027/// Fails closed if root switching cannot be established.
1028pub fn switch_root(new_root: &Path, allow_chroot_fallback: bool) -> Result<()> {
1029    info!("Switching root to {:?}", new_root);
1030
1031    match pivot_root_impl(new_root) {
1032        Ok(()) => {
1033            info!("Successfully switched root using pivot_root");
1034            Ok(())
1035        }
1036        Err(e) => {
1037            if allow_chroot_fallback {
1038                warn!(
1039                    "pivot_root failed ({}), falling back to chroot due to explicit \
1040                     configuration",
1041                    e
1042                );
1043                chroot_impl(new_root)
1044            } else {
1045                Err(NucleusError::PivotRootError(format!(
1046                    "pivot_root failed: {}. chroot fallback is disabled by default; use \
1047                     --allow-chroot-fallback to allow weaker isolation",
1048                    e
1049                )))
1050            }
1051        }
1052    }
1053}
1054
1055/// Implement root switch using pivot_root(2)
1056///
1057/// pivot_root is preferred over chroot because:
1058/// - More secure (old root can be unmounted)
1059/// - Works better with mount namespaces
1060fn pivot_root_impl(new_root: &Path) -> Result<()> {
1061    use nix::unistd::pivot_root;
1062
1063    // pivot_root requires new_root to be a mount point
1064    // and old_root to be under new_root
1065
1066    let old_root = new_root.join(".old_root");
1067    std::fs::create_dir_all(&old_root).map_err(|e| {
1068        NucleusError::PivotRootError(format!("Failed to create old_root directory: {}", e))
1069    })?;
1070
1071    // Perform pivot_root
1072    pivot_root(new_root, &old_root)
1073        .map_err(|e| NucleusError::PivotRootError(format!("pivot_root syscall failed: {}", e)))?;
1074
1075    // Change to new root
1076    std::env::set_current_dir("/")
1077        .map_err(|e| NucleusError::PivotRootError(format!("Failed to chdir to /: {}", e)))?;
1078
1079    // Unmount old root
1080    nix::mount::umount2("/.old_root", nix::mount::MntFlags::MNT_DETACH)
1081        .map_err(|e| NucleusError::PivotRootError(format!("Failed to unmount old root: {}", e)))?;
1082
1083    // Remove old root directory
1084    let _ = std::fs::remove_dir("/.old_root");
1085
1086    Ok(())
1087}
1088
1089/// Implement root switch using chroot(2)
1090///
1091/// chroot is less secure than pivot_root but works in more situations
1092fn chroot_impl(new_root: &Path) -> Result<()> {
1093    fn close_non_stdio_fds_after_chroot() -> Result<()> {
1094        // Any pre-chroot fd can still reach outside the jail, so close every
1095        // non-stdio descriptor before continuing setup inside the fallback root.
1096        let ret = unsafe { libc::syscall(libc::SYS_close_range, 3u32, u32::MAX, 0u32) };
1097        if ret == 0 {
1098            return Ok(());
1099        }
1100
1101        let max_fd = match unsafe { libc::sysconf(libc::_SC_OPEN_MAX) } {
1102            n if n > 3 && n <= i32::MAX as libc::c_long => n as i32,
1103            _ => 1024,
1104        };
1105
1106        for fd in 3..max_fd {
1107            if unsafe { libc::close(fd) } != 0 {
1108                let err = std::io::Error::last_os_error();
1109                if err.raw_os_error() != Some(libc::EBADF) {
1110                    return Err(NucleusError::PivotRootError(format!(
1111                        "Failed to close inherited fd {} after chroot: {}",
1112                        fd, err
1113                    )));
1114                }
1115            }
1116        }
1117
1118        Ok(())
1119    }
1120
1121    chroot(new_root)
1122        .map_err(|e| NucleusError::PivotRootError(format!("chroot syscall failed: {}", e)))?;
1123
1124    // Change to new root
1125    std::env::set_current_dir("/")
1126        .map_err(|e| NucleusError::PivotRootError(format!("Failed to chdir to /: {}", e)))?;
1127
1128    close_non_stdio_fds_after_chroot()?;
1129
1130    // L3: Drop CAP_SYS_CHROOT after chroot to prevent escape via nested chroot.
1131    if let Err(e) = caps::drop(
1132        None,
1133        caps::CapSet::Bounding,
1134        caps::Capability::CAP_SYS_CHROOT,
1135    ) {
1136        debug!(
1137            "Could not drop CAP_SYS_CHROOT after chroot: {} (may not be present)",
1138            e
1139        );
1140    }
1141    if let Err(e) = caps::drop(
1142        None,
1143        caps::CapSet::Effective,
1144        caps::Capability::CAP_SYS_CHROOT,
1145    ) {
1146        debug!(
1147            "Could not drop effective CAP_SYS_CHROOT: {} (may not be present)",
1148            e
1149        );
1150    }
1151    if let Err(e) = caps::drop(
1152        None,
1153        caps::CapSet::Permitted,
1154        caps::Capability::CAP_SYS_CHROOT,
1155    ) {
1156        debug!(
1157            "Could not drop permitted CAP_SYS_CHROOT: {} (may not be present)",
1158            e
1159        );
1160    }
1161
1162    info!("Successfully switched root using chroot (CAP_SYS_CHROOT dropped)");
1163
1164    Ok(())
1165}
1166
1167/// Mount secret files into the container root.
1168///
1169/// Each secret is bind-mounted read-only from its source to the destination
1170/// path inside the container. Intermediate directories are created as needed.
1171pub fn mount_secrets(root: &Path, secrets: &[crate::container::SecretMount]) -> Result<()> {
1172    if secrets.is_empty() {
1173        return Ok(());
1174    }
1175
1176    info!("Mounting {} secret(s) into container", secrets.len());
1177
1178    for secret in secrets {
1179        let source_fd = open(
1180            &secret.source,
1181            OFlag::O_PATH | OFlag::O_NOFOLLOW | OFlag::O_CLOEXEC,
1182            Mode::empty(),
1183        )
1184        .map_err(|e| {
1185            NucleusError::FilesystemError(format!(
1186                "Failed to open secret source {:?} with O_NOFOLLOW: {}",
1187                secret.source, e
1188            ))
1189        })?;
1190        let source_stat = fstat(&source_fd).map_err(|e| {
1191            NucleusError::FilesystemError(format!(
1192                "Failed to stat secret source {:?}: {}",
1193                secret.source, e
1194            ))
1195        })?;
1196        let source_kind = SFlag::from_bits_truncate(source_stat.st_mode);
1197        let source_is_file = source_kind == SFlag::S_IFREG;
1198        let source_is_dir = source_kind == SFlag::S_IFDIR;
1199        if !source_is_file && !source_is_dir {
1200            return Err(NucleusError::FilesystemError(format!(
1201                "Secret source {:?} must be a regular file or directory",
1202                secret.source
1203            )));
1204        }
1205        let source_fd_path = PathBuf::from(format!("/proc/self/fd/{}", source_fd.as_raw_fd()));
1206
1207        // Destination inside container root
1208        let dest = resolve_container_destination(root, &secret.dest)?;
1209
1210        // Create parent directories
1211        if let Some(parent) = dest.parent() {
1212            std::fs::create_dir_all(parent).map_err(|e| {
1213                NucleusError::FilesystemError(format!(
1214                    "Failed to create secret mount parent {:?}: {}",
1215                    parent, e
1216                ))
1217            })?;
1218        }
1219
1220        // Create mount point file
1221        if source_is_file {
1222            std::fs::write(&dest, "").map_err(|e| {
1223                NucleusError::FilesystemError(format!(
1224                    "Failed to create secret mount point {:?}: {}",
1225                    dest, e
1226                ))
1227            })?;
1228        } else {
1229            std::fs::create_dir_all(&dest).map_err(|e| {
1230                NucleusError::FilesystemError(format!(
1231                    "Failed to create secret mount dir {:?}: {}",
1232                    dest, e
1233                ))
1234            })?;
1235        }
1236
1237        // Bind mount read-only
1238        mount(
1239            Some(source_fd_path.as_path()),
1240            &dest,
1241            None::<&str>,
1242            MsFlags::MS_BIND,
1243            None::<&str>,
1244        )
1245        .map_err(|e| {
1246            NucleusError::FilesystemError(format!(
1247                "Failed to bind mount secret {:?}: {}",
1248                secret.source, e
1249            ))
1250        })?;
1251
1252        mount(
1253            None::<&str>,
1254            &dest,
1255            None::<&str>,
1256            MsFlags::MS_REMOUNT
1257                | MsFlags::MS_BIND
1258                | MsFlags::MS_RDONLY
1259                | MsFlags::MS_NOSUID
1260                | MsFlags::MS_NODEV
1261                | MsFlags::MS_NOEXEC,
1262            None::<&str>,
1263        )
1264        .map_err(|e| {
1265            NucleusError::FilesystemError(format!(
1266                "Failed to remount secret {:?} read-only: {}",
1267                dest, e
1268            ))
1269        })?;
1270
1271        // Apply configured file permissions on the mount point
1272        if source_is_file {
1273            use std::os::unix::fs::PermissionsExt;
1274            let perms = std::fs::Permissions::from_mode(secret.mode);
1275            if let Err(e) = std::fs::set_permissions(&dest, perms) {
1276                warn!(
1277                    "Failed to set mode {:04o} on secret {:?}: {} (bind mount may override)",
1278                    secret.mode, dest, e
1279                );
1280            }
1281        }
1282
1283        debug!(
1284            "Mounted secret {:?} -> {:?} (mode {:04o})",
1285            secret.source, secret.dest, secret.mode
1286        );
1287    }
1288
1289    Ok(())
1290}
1291
1292/// Mount secrets onto a dedicated in-memory tmpfs instead of bind-mounting host paths.
1293///
1294/// Creates a per-container tmpfs at `<root>/run/secrets` with MS_NOEXEC | MS_NOSUID | MS_NODEV,
1295/// copies secret contents into it, then zeros the read buffer. This ensures secrets
1296/// never reference host-side files after setup and are never persisted to disk.
1297pub fn mount_secrets_inmemory(
1298    root: &Path,
1299    secrets: &[crate::container::SecretMount],
1300    identity: &crate::container::ProcessIdentity,
1301) -> Result<()> {
1302    if secrets.is_empty() {
1303        return Ok(());
1304    }
1305
1306    info!("Mounting {} secret(s) on in-memory tmpfs", secrets.len());
1307
1308    let secrets_dir = root.join("run/secrets");
1309    std::fs::create_dir_all(&secrets_dir).map_err(|e| {
1310        NucleusError::FilesystemError(format!(
1311            "Failed to create secrets dir {:?}: {}",
1312            secrets_dir, e
1313        ))
1314    })?;
1315
1316    // Mount a size-limited tmpfs for secrets (16 MiB max)
1317    if let Err(e) = mount(
1318        Some("tmpfs"),
1319        &secrets_dir,
1320        Some("tmpfs"),
1321        MsFlags::MS_NOSUID | MsFlags::MS_NODEV | MsFlags::MS_NOEXEC,
1322        Some("size=16m,mode=0700"),
1323    ) {
1324        let _ = std::fs::remove_dir_all(&secrets_dir);
1325        return Err(NucleusError::FilesystemError(format!(
1326            "Failed to mount secrets tmpfs at {:?}: {}",
1327            secrets_dir, e
1328        )));
1329    }
1330
1331    if !identity.is_root() {
1332        nix::unistd::chown(
1333            &secrets_dir,
1334            Some(nix::unistd::Uid::from_raw(identity.uid)),
1335            Some(nix::unistd::Gid::from_raw(identity.gid)),
1336        )
1337        .map_err(|e| {
1338            let _ = nix::mount::umount2(&secrets_dir, nix::mount::MntFlags::MNT_DETACH);
1339            let _ = std::fs::remove_dir_all(&secrets_dir);
1340            NucleusError::FilesystemError(format!(
1341                "Failed to set /run/secrets owner to {}:{}: {}",
1342                identity.uid, identity.gid, e
1343            ))
1344        })?;
1345    }
1346
1347    // Rollback: unmount tmpfs and remove dir if any secret fails
1348    let result = mount_secrets_inmemory_inner(&secrets_dir, root, secrets, identity);
1349    if let Err(ref e) = result {
1350        let _ = nix::mount::umount2(&secrets_dir, nix::mount::MntFlags::MNT_DETACH);
1351        let _ = std::fs::remove_dir_all(&secrets_dir);
1352        return Err(NucleusError::FilesystemError(format!(
1353            "Secret mount failed (rolled back): {}",
1354            e
1355        )));
1356    }
1357
1358    info!("All secrets mounted on in-memory tmpfs");
1359    Ok(())
1360}
1361
1362fn mount_secrets_inmemory_inner(
1363    secrets_dir: &Path,
1364    root: &Path,
1365    secrets: &[crate::container::SecretMount],
1366    identity: &crate::container::ProcessIdentity,
1367) -> Result<()> {
1368    for secret in secrets {
1369        let mut content = read_regular_file_nofollow(&secret.source)?;
1370
1371        // Determine destination path inside the secrets tmpfs
1372        let dest = resolve_container_destination(secrets_dir, &secret.dest)?;
1373
1374        // Create parent directories within the tmpfs
1375        if let Some(parent) = dest.parent() {
1376            std::fs::create_dir_all(parent).map_err(|e| {
1377                NucleusError::FilesystemError(format!(
1378                    "Failed to create secret parent dir {:?}: {}",
1379                    parent, e
1380                ))
1381            })?;
1382        }
1383
1384        // Write secret content to tmpfs
1385        std::fs::write(&dest, &content).map_err(|e| {
1386            NucleusError::FilesystemError(format!("Failed to write secret to {:?}: {}", dest, e))
1387        })?;
1388
1389        // Set permissions
1390        {
1391            use std::os::unix::fs::PermissionsExt;
1392            let perms = std::fs::Permissions::from_mode(secret.mode);
1393            std::fs::set_permissions(&dest, perms).map_err(|e| {
1394                NucleusError::FilesystemError(format!(
1395                    "Failed to set permissions on secret {:?}: {}",
1396                    dest, e
1397                ))
1398            })?;
1399        }
1400
1401        if !identity.is_root() {
1402            nix::unistd::chown(
1403                &dest,
1404                Some(nix::unistd::Uid::from_raw(identity.uid)),
1405                Some(nix::unistd::Gid::from_raw(identity.gid)),
1406            )
1407            .map_err(|e| {
1408                NucleusError::FilesystemError(format!(
1409                    "Failed to set permissions owner on secret {:?} to {}:{}: {}",
1410                    dest, identity.uid, identity.gid, e
1411                ))
1412            })?;
1413        }
1414
1415        // Zero the in-memory buffer
1416        zeroize::Zeroize::zeroize(&mut content);
1417        drop(content);
1418
1419        // Also bind-mount the secret to its expected container path for compatibility
1420        let container_dest = resolve_container_destination(root, &secret.dest)?;
1421        if container_dest != dest {
1422            if let Some(parent) = container_dest.parent() {
1423                std::fs::create_dir_all(parent).map_err(|e| {
1424                    NucleusError::FilesystemError(format!(
1425                        "Failed to create secret mount parent {:?}: {}",
1426                        parent, e
1427                    ))
1428                })?;
1429            }
1430
1431            std::fs::write(&container_dest, "").map_err(|e| {
1432                NucleusError::FilesystemError(format!(
1433                    "Failed to create secret mount point {:?}: {}",
1434                    container_dest, e
1435                ))
1436            })?;
1437
1438            mount(
1439                Some(dest.as_path()),
1440                &container_dest,
1441                None::<&str>,
1442                MsFlags::MS_BIND,
1443                None::<&str>,
1444            )
1445            .map_err(|e| {
1446                NucleusError::FilesystemError(format!(
1447                    "Failed to bind mount secret {:?} -> {:?}: {}",
1448                    dest, container_dest, e
1449                ))
1450            })?;
1451
1452            mount(
1453                None::<&str>,
1454                &container_dest,
1455                None::<&str>,
1456                MsFlags::MS_REMOUNT
1457                    | MsFlags::MS_BIND
1458                    | MsFlags::MS_RDONLY
1459                    | MsFlags::MS_NOSUID
1460                    | MsFlags::MS_NODEV
1461                    | MsFlags::MS_NOEXEC,
1462                None::<&str>,
1463            )
1464            .map_err(|e| {
1465                NucleusError::FilesystemError(format!(
1466                    "Failed to remount secret {:?} read-only: {}",
1467                    container_dest, e
1468                ))
1469            })?;
1470        }
1471
1472        debug!(
1473            "Secret {:?} -> {:?} (in-memory tmpfs, mode {:04o})",
1474            secret.source, secret.dest, secret.mode
1475        );
1476    }
1477
1478    Ok(())
1479}
1480
#[cfg(test)]
mod tests {
    use super::*;
    use std::os::unix::fs::symlink;

    /// Sources inside sensitive host subtrees must be refused.
    #[test]
    fn test_validate_bind_mount_source_rejects_sensitive_subtrees() {
        let sensitive = ["/proc/sys", "/sys/fs/cgroup", "/dev/kmsg", "/boot"];
        for path in sensitive {
            let err = validate_bind_mount_source(Path::new(path)).unwrap_err();
            let rejected_as_sensitive = err.to_string().contains("sensitive host path");
            assert!(
                rejected_as_sensitive,
                "expected sensitive-path rejection for {path}, got: {err}"
            );
        }
    }

    /// An ordinary directory is accepted as a bind-mount source.
    #[test]
    fn test_validate_bind_mount_source_allows_regular_host_paths() {
        let scratch = tempfile::TempDir::new().unwrap();
        let data_dir = scratch.path().join("data");
        std::fs::create_dir(&data_dir).unwrap();

        validate_bind_mount_source(&data_dir).unwrap();
    }

    /// A path containing `..` that still lands on a safe directory is accepted.
    #[test]
    fn test_validate_bind_mount_source_normalizes_parent_components_before_filtering() {
        let scratch = tempfile::TempDir::new().unwrap();
        let data_dir = scratch.path().join("data");
        std::fs::create_dir(&data_dir).unwrap();

        let roundabout = data_dir.join("../data");
        validate_bind_mount_source(&roundabout).unwrap();
    }

    #[test]
    fn test_proc_mask_includes_sysrq_trigger() {
        let covered = PROC_NULL_MASKED.contains(&"sysrq-trigger");
        assert!(
            covered,
            "/proc/sysrq-trigger must be masked to prevent host DoS"
        );
    }

    #[test]
    fn test_proc_mask_includes_timer_stats() {
        let covered = PROC_NULL_MASKED.contains(&"timer_stats");
        assert!(
            covered,
            "/proc/timer_stats must be masked to prevent kernel info leakage"
        );
    }

    #[test]
    fn test_proc_mask_includes_kpage_files() {
        let kpage_entries = ["kpagecount", "kpageflags", "kpagecgroup"];
        for entry in kpage_entries.iter() {
            assert!(
                PROC_NULL_MASKED.contains(entry),
                "/proc/{} must be masked to prevent host memory layout leakage",
                entry
            );
        }
    }

    #[test]
    fn test_proc_mask_includes_oci_standard_paths() {
        // OCI runtime spec required masked paths
        let null_masked = ["kallsyms", "kcore", "sched_debug", "keys", "config.gz"];
        for entry in null_masked.iter() {
            assert!(
                PROC_NULL_MASKED.contains(entry),
                "/proc/{} must be in null-masked list (OCI spec)",
                entry
            );
        }

        let tmpfs_masked = ["acpi", "scsi"];
        for entry in tmpfs_masked.iter() {
            assert!(
                PROC_TMPFS_MASKED.contains(entry),
                "/proc/{} must be in tmpfs-masked list (OCI spec)",
                entry
            );
        }

        let read_only = ["bus", "fs", "irq", "sys"];
        for entry in read_only.iter() {
            assert!(
                PROC_READONLY_PATHS.contains(entry),
                "/proc/{} must be in read-only remount list (OCI spec)",
                entry
            );
            assert!(
                !PROC_TMPFS_MASKED.contains(entry),
                "/proc/{} must stay visible read-only, not hidden behind tmpfs",
                entry
            );
        }
    }

    #[test]
    fn test_parse_mountinfo_line_uses_mountinfo_mount_point_and_flags() {
        let sample =
            "36 25 0:32 / /run/secrets rw,nosuid,nodev,noexec,relatime - tmpfs tmpfs rw,size=1024k";
        let (mount_point, flags) = parse_mountinfo_line(sample).unwrap();

        assert_eq!(mount_point, "/run/secrets");
        for expected in ["nosuid", "nodev", "noexec"] {
            assert!(flags.contains(expected));
        }
    }

    #[test]
    fn test_parse_mountinfo_line_decodes_escaped_mount_points() {
        let sample = "41 25 0:40 / /path\\040with\\040spaces ro,nosuid,nodev - ext4 /dev/root ro";
        let (mount_point, flags) = parse_mountinfo_line(sample).unwrap();

        assert_eq!(mount_point, "/path with spaces");
        assert!(flags.contains("ro"));
    }

    /// Source-level check: the chroot fallback must call the fd-closing helper.
    #[test]
    fn test_chroot_impl_closes_non_stdio_fds() {
        let source = include_str!("mount.rs");
        let from_fn = &source[source.find("fn chroot_impl").unwrap()..];
        let body_open = from_fn.find('{').unwrap();

        // Walk the braces from the opening one until nesting returns to zero.
        let mut nesting = 0u32;
        let mut body_close = body_open;
        for (offset, ch) in from_fn[body_open..].char_indices() {
            if ch == '{' {
                nesting += 1;
            } else if ch == '}' {
                nesting -= 1;
                if nesting == 0 {
                    body_close = body_open + offset + 1;
                    break;
                }
            }
        }

        let body = &from_fn[..body_close];
        assert!(
            body.contains("close_non_stdio_fds_after_chroot()?"),
            "chroot fallback must close inherited non-stdio fds before continuing setup"
        );
    }

    #[test]
    fn test_read_regular_file_nofollow_reads_regular_file() {
        let scratch = tempfile::tempdir().unwrap();
        let secret_path = scratch.path().join("secret.txt");
        std::fs::write(&secret_path, "supersecret").unwrap();

        let content = read_regular_file_nofollow(&secret_path).unwrap();
        assert_eq!(content, b"supersecret");
    }

    #[test]
    fn test_read_regular_file_nofollow_rejects_symlink() {
        let scratch = tempfile::tempdir().unwrap();
        let real_file = scratch.path().join("target.txt");
        let link = scratch.path().join("secret-link");
        std::fs::write(&real_file, "supersecret").unwrap();
        symlink(&real_file, &link).unwrap();

        let err = read_regular_file_nofollow(&link).unwrap_err();
        let mentions_nofollow = err.to_string().contains("O_NOFOLLOW");
        assert!(mentions_nofollow, "symlink reads must fail via O_NOFOLLOW");
    }
}