Skip to main content

nucleus/filesystem/
mount.rs

1use crate::error::{NucleusError, Result};
2use nix::fcntl::{open, OFlag};
3use nix::mount::{mount, MsFlags};
4use nix::sys::stat::{fstat, makedev, mknod, Mode, SFlag};
5use nix::unistd::chroot;
6use std::fs::OpenOptions;
7use std::io::Read;
8use std::os::fd::AsRawFd;
9use std::os::unix::fs::OpenOptionsExt;
10use std::path::{Component, Path, PathBuf};
11use tracing::{debug, info, warn};
12
/// One row of the mount audit table: a mount point and the options it must carry.
struct ExpectedMount {
    /// Mount point, compared verbatim against the second field of each
    /// `/proc/self/mounts` line.
    path: &'static str,
    /// Option names (e.g. `ro`, `nosuid`) that must each appear in the
    /// comma-separated option list of the mount.
    required_flags: &'static [&'static str],
    /// If true, the mount *must* exist in production mode. A missing critical
    /// mount (e.g. /proc) is treated as a violation rather than silently skipped.
    critical: bool,
}
21
/// Known mount paths and the flags they must carry in production mode.
///
/// Consumed by `audit_mounts`, which looks each `path` up in
/// `/proc/self/mounts`. Entries with `critical: false` are only checked when
/// the mount is actually present.
const PRODUCTION_MOUNT_EXPECTATIONS: &[ExpectedMount] = &[
    ExpectedMount {
        path: "/bin",
        required_flags: &["ro", "nosuid", "nodev"],
        critical: true,
    },
    ExpectedMount {
        path: "/usr",
        required_flags: &["ro", "nosuid", "nodev"],
        critical: true,
    },
    ExpectedMount {
        path: "/lib",
        required_flags: &["ro", "nosuid", "nodev"],
        critical: false, // not all rootfs layouts have /lib
    },
    ExpectedMount {
        path: "/lib64",
        required_flags: &["ro", "nosuid", "nodev"],
        critical: false, // not all rootfs layouts have /lib64
    },
    ExpectedMount {
        path: "/etc",
        required_flags: &["ro", "nosuid", "nodev"],
        critical: true,
    },
    ExpectedMount {
        path: "/nix",
        required_flags: &["ro", "nosuid", "nodev"],
        critical: false, // only present on NixOS-based rootfs
    },
    ExpectedMount {
        path: "/sbin",
        required_flags: &["ro", "nosuid", "nodev"],
        critical: false, // not all rootfs layouts have /sbin
    },
    // Pseudo/secret mounts are not required to be read-only, but must never
    // allow execution.
    ExpectedMount {
        path: "/proc",
        required_flags: &["nosuid", "nodev", "noexec"],
        critical: true,
    },
    ExpectedMount {
        path: "/run/secrets",
        required_flags: &["nosuid", "nodev", "noexec"],
        critical: false, // only present when secrets are configured
    },
];
70
71/// Normalize an absolute container destination path and reject traversal.
72///
73/// Returns a normalized absolute path containing only `RootDir` and `Normal`
74/// components. `.` segments are ignored; `..` and relative paths are rejected.
75pub fn normalize_container_destination(dest: &Path) -> Result<PathBuf> {
76    if !dest.is_absolute() {
77        return Err(NucleusError::ConfigError(format!(
78            "Container destination must be absolute: {:?}",
79            dest
80        )));
81    }
82
83    let mut normalized = PathBuf::from("/");
84    let mut saw_component = false;
85
86    for component in dest.components() {
87        match component {
88            Component::RootDir => {}
89            Component::CurDir => {}
90            Component::Normal(part) => {
91                normalized.push(part);
92                saw_component = true;
93            }
94            Component::ParentDir => {
95                return Err(NucleusError::ConfigError(format!(
96                    "Container destination must not contain parent traversal: {:?}",
97                    dest
98                )));
99            }
100            Component::Prefix(_) => {
101                return Err(NucleusError::ConfigError(format!(
102                    "Unsupported container destination prefix: {:?}",
103                    dest
104                )));
105            }
106        }
107    }
108
109    if !saw_component {
110        return Err(NucleusError::ConfigError(format!(
111            "Container destination must not be the root directory: {:?}",
112            dest
113        )));
114    }
115
116    Ok(normalized)
117}
118
119/// Resolve a validated container destination under a host-side root directory.
120pub fn resolve_container_destination(root: &Path, dest: &Path) -> Result<PathBuf> {
121    let normalized = normalize_container_destination(dest)?;
122    let relative = normalized.strip_prefix("/").map_err(|_| {
123        NucleusError::ConfigError(format!(
124            "Container destination is not absolute after normalization: {:?}",
125            normalized
126        ))
127    })?;
128    Ok(root.join(relative))
129}
130
131pub(crate) fn read_regular_file_nofollow(path: &Path) -> Result<Vec<u8>> {
132    let mut file = OpenOptions::new()
133        .read(true)
134        .custom_flags(libc::O_NOFOLLOW | libc::O_CLOEXEC)
135        .open(path)
136        .map_err(|e| {
137            NucleusError::FilesystemError(format!(
138                "Failed to open file {:?} with O_NOFOLLOW: {}",
139                path, e
140            ))
141        })?;
142
143    let metadata = file.metadata().map_err(|e| {
144        NucleusError::FilesystemError(format!("Failed to stat file {:?}: {}", path, e))
145    })?;
146    if !metadata.is_file() {
147        return Err(NucleusError::FilesystemError(format!(
148            "Expected regular file for {:?}, found non-file source",
149            path
150        )));
151    }
152
153    let mut content = Vec::new();
154    file.read_to_end(&mut content).map_err(|e| {
155        NucleusError::FilesystemError(format!("Failed to read file {:?}: {}", path, e))
156    })?;
157    Ok(content)
158}
159
160/// Audit all mounts in the container's mount namespace.
161///
162/// Reads /proc/self/mounts and verifies that each known mount point carries
163/// its expected flags. In production mode, any missing flag is fatal.
164/// Returns Ok(()) if all checks pass, or a list of violations.
165pub fn audit_mounts(production_mode: bool) -> Result<()> {
166    let mounts_content = std::fs::read_to_string("/proc/self/mounts").map_err(|e| {
167        NucleusError::FilesystemError(format!("Failed to read /proc/self/mounts: {}", e))
168    })?;
169
170    let mut violations = Vec::new();
171
172    for expectation in PRODUCTION_MOUNT_EXPECTATIONS {
173        // Find the mount entry for this path
174        let mount_entry = mounts_content.lines().find(|line| {
175            let parts: Vec<&str> = line.split_whitespace().collect();
176            parts.len() >= 4 && parts[1] == expectation.path
177        });
178
179        if let Some(entry) = mount_entry {
180            let parts: Vec<&str> = entry.split_whitespace().collect();
181            if parts.len() >= 4 {
182                let options = parts[3];
183                for &flag in expectation.required_flags {
184                    if !options.split(',').any(|opt| opt == flag) {
185                        violations.push(format!(
186                            "Mount {} missing required flag '{}' (has: {})",
187                            expectation.path, flag, options
188                        ));
189                    }
190                }
191            }
192        } else if expectation.critical && production_mode {
193            violations.push(format!(
194                "Critical mount {} is missing from the mount namespace",
195                expectation.path
196            ));
197        }
198    }
199
200    if violations.is_empty() {
201        info!("Mount audit passed: all expected flags verified");
202        Ok(())
203    } else if production_mode {
204        Err(NucleusError::FilesystemError(format!(
205            "Mount audit failed in production mode:\n  {}",
206            violations.join("\n  ")
207        )))
208    } else {
209        for v in &violations {
210            warn!("Mount audit: {}", v);
211        }
212        Ok(())
213    }
214}
215
216/// Create minimal filesystem structure in the new root
217pub fn create_minimal_fs(root: &Path) -> Result<()> {
218    info!("Creating minimal filesystem structure at {:?}", root);
219
220    // Create essential directories
221    let dirs = vec![
222        "dev",
223        "proc",
224        "sys",
225        "tmp",
226        "bin",
227        "sbin",
228        "usr",
229        "lib",
230        "lib64",
231        "etc",
232        "nix",
233        "nix/store",
234        "run",
235        "context",
236    ];
237
238    for dir in dirs {
239        let path = root.join(dir);
240        std::fs::create_dir_all(&path).map_err(|e| {
241            NucleusError::FilesystemError(format!("Failed to create directory {:?}: {}", path, e))
242        })?;
243    }
244
245    info!("Created minimal filesystem structure");
246
247    Ok(())
248}
249
250/// Create essential device nodes in /dev
251///
252/// In rootless mode, device node creation will fail gracefully
253pub fn create_dev_nodes(dev_path: &Path, include_tty: bool) -> Result<()> {
254    info!("Creating device nodes at {:?}", dev_path);
255
256    // Device nodes: (name, type, major, minor)
257    let mut devices = vec![
258        ("null", SFlag::S_IFCHR, 1, 3),
259        ("zero", SFlag::S_IFCHR, 1, 5),
260        ("full", SFlag::S_IFCHR, 1, 7),
261        ("random", SFlag::S_IFCHR, 1, 8),
262        ("urandom", SFlag::S_IFCHR, 1, 9),
263    ];
264    if include_tty {
265        devices.push(("tty", SFlag::S_IFCHR, 5, 0));
266    }
267
268    let mut created_count = 0;
269    let mut failed_count = 0;
270
271    for (name, dev_type, major, minor) in devices {
272        let path = dev_path.join(name);
273        let mode = Mode::from_bits_truncate(0o660);
274        let dev = makedev(major, minor);
275
276        match mknod(&path, dev_type, mode, dev) {
277            Ok(_) => {
278                info!("Created device node: {:?}", path);
279                created_count += 1;
280            }
281            Err(e) => {
282                // In rootless mode, mknod fails - this is expected
283                warn!(
284                    "Failed to create device node {:?}: {} (this is normal in rootless mode)",
285                    path, e
286                );
287                failed_count += 1;
288            }
289        }
290    }
291
292    if created_count > 0 {
293        info!("Successfully created {} device nodes", created_count);
294    }
295    if failed_count > 0 {
296        info!("Skipped {} device nodes (rootless mode)", failed_count);
297    }
298
299    Ok(())
300}
301
302/// Bind mount a pre-built rootfs (e.g. a Nix store closure) into the container.
303///
304/// Instead of exposing the full host /bin, /usr, /lib, /lib64, /nix, this mounts
305/// a minimal, purpose-built root filesystem. Suitable for production services.
306pub fn bind_mount_rootfs(root: &Path, rootfs_path: &Path) -> Result<()> {
307    info!(
308        "Bind mounting production rootfs {:?} into container {:?}",
309        rootfs_path, root
310    );
311
312    if std::fs::symlink_metadata(rootfs_path).is_err() {
313        return Err(NucleusError::FilesystemError(format!(
314            "Rootfs path does not exist: {:?}",
315            rootfs_path
316        )));
317    }
318
319    // Bind mount the rootfs contents into the container root.
320    // The rootfs is expected to contain a standard FHS layout (/bin, /lib, /etc, etc.)
321    // produced by a Nix buildEnv or similar.
322    let subdirs = ["bin", "sbin", "lib", "lib64", "usr", "etc", "nix"];
323
324    for subdir in &subdirs {
325        let source = rootfs_path.join(subdir);
326        if !source.exists() {
327            debug!("Rootfs subdir {} not present, skipping", subdir);
328            continue;
329        }
330
331        let target = root.join(subdir);
332        std::fs::create_dir_all(&target).map_err(|e| {
333            NucleusError::FilesystemError(format!(
334                "Failed to create mount point {:?}: {}",
335                target, e
336            ))
337        })?;
338
339        mount(
340            Some(&source),
341            &target,
342            None::<&str>,
343            MsFlags::MS_BIND | MsFlags::MS_REC,
344            None::<&str>,
345        )
346        .map_err(|e| {
347            NucleusError::FilesystemError(format!(
348                "Failed to bind mount rootfs {:?} -> {:?}: {}",
349                source, target, e
350            ))
351        })?;
352
353        // Remount read-only
354        mount(
355            None::<&str>,
356            &target,
357            None::<&str>,
358            MsFlags::MS_REMOUNT
359                | MsFlags::MS_BIND
360                | MsFlags::MS_RDONLY
361                | MsFlags::MS_REC
362                | MsFlags::MS_NOSUID
363                | MsFlags::MS_NODEV,
364            None::<&str>,
365        )
366        .map_err(|e| {
367            NucleusError::FilesystemError(format!(
368                "Failed to remount rootfs {:?} read-only: {}",
369                target, e
370            ))
371        })?;
372
373        info!("Mounted rootfs/{} read-only", subdir);
374    }
375
376    Ok(())
377}
378
379/// Bind mount essential host directories into container
380///
381/// This allows host binaries to be accessible inside the container.
382/// Used in agent mode. Production mode should use bind_mount_rootfs() instead.
383pub fn bind_mount_host_paths(root: &Path, best_effort: bool) -> Result<()> {
384    info!("Bind mounting host paths into container");
385
386    // Essential paths to bind mount (read-only)
387    let host_paths = vec![
388        "/bin", "/usr", "/lib", "/lib64", "/nix", // For NixOS
389    ];
390
391    for host_path in host_paths {
392        let host = Path::new(host_path);
393
394        // Only mount if the path exists on the host
395        if !host.exists() {
396            debug!("Skipping {} (not present on host)", host_path);
397            continue;
398        }
399
400        let container_path = root.join(host_path.trim_start_matches('/'));
401
402        // Create mount point
403        if let Err(e) = std::fs::create_dir_all(&container_path) {
404            if best_effort {
405                warn!("Failed to create mount point {:?}: {}", container_path, e);
406                continue;
407            }
408            return Err(NucleusError::FilesystemError(format!(
409                "Failed to create mount point {:?}: {}",
410                container_path, e
411            )));
412        }
413
414        // Attempt bind mount
415        // Note: Linux ignores MS_RDONLY on the initial bind mount call.
416        // A second remount is required to actually enforce read-only.
417        match mount(
418            Some(host),
419            &container_path,
420            None::<&str>,
421            MsFlags::MS_BIND | MsFlags::MS_REC,
422            None::<&str>,
423        ) {
424            Ok(_) => {
425                // Remount as read-only – required because MS_RDONLY is ignored on initial bind
426                mount(
427                    None::<&str>,
428                    &container_path,
429                    None::<&str>,
430                    MsFlags::MS_REMOUNT
431                        | MsFlags::MS_BIND
432                        | MsFlags::MS_RDONLY
433                        | MsFlags::MS_REC
434                        | MsFlags::MS_NOSUID
435                        | MsFlags::MS_NODEV,
436                    None::<&str>,
437                )
438                .map_err(|e| {
439                    NucleusError::FilesystemError(format!(
440                        "Failed to remount {} as read-only: {}",
441                        host_path, e
442                    ))
443                })?;
444                info!(
445                    "Bind mounted {} to {:?} (read-only)",
446                    host_path, container_path
447                );
448            }
449            Err(e) => {
450                if best_effort {
451                    warn!(
452                        "Failed to bind mount {}: {} (continuing anyway)",
453                        host_path, e
454                    );
455                } else {
456                    return Err(NucleusError::FilesystemError(format!(
457                        "Failed to bind mount {}: {}",
458                        host_path, e
459                    )));
460                }
461            }
462        }
463    }
464
465    Ok(())
466}
467
/// H7: Sensitive host paths that must not be bind-mounted into containers.
///
/// Matched by exact path equality in `reject_denied_bind_mount_source`, so
/// other entries under the same parent directory are not covered by this list.
const DENIED_BIND_MOUNT_SOURCES_EXACT: &[&str] = &[
    "/",
    "/etc/shadow",
    "/etc/sudoers",
    "/etc/passwd",
    "/etc/gshadow",
];
476
/// Sensitive host subtrees that must not be exposed to a container at all.
///
/// Matched component-wise with `Path::starts_with`, so each entry denies the
/// directory itself and everything below it.
const DENIED_BIND_MOUNT_SOURCE_PREFIXES: &[&str] = &["/proc", "/sys", "/dev", "/boot"];
479
480fn normalize_bind_mount_source_for_policy(source: &Path) -> Result<PathBuf> {
481    if !source.is_absolute() {
482        return Err(NucleusError::ConfigError(format!(
483            "Bind mount source must be absolute: {:?}",
484            source
485        )));
486    }
487
488    let mut normalized = PathBuf::from("/");
489
490    for component in source.components() {
491        match component {
492            Component::RootDir => {}
493            Component::CurDir => {}
494            Component::Normal(part) => normalized.push(part),
495            Component::ParentDir => {
496                normalized.pop();
497                if normalized.as_os_str().is_empty() {
498                    normalized.push("/");
499                }
500            }
501            Component::Prefix(_) => {
502                return Err(NucleusError::ConfigError(format!(
503                    "Unsupported bind mount source prefix: {:?}",
504                    source
505                )));
506            }
507        }
508    }
509
510    Ok(normalized)
511}
512
513fn reject_denied_bind_mount_source(source: &Path) -> Result<()> {
514    for denied in DENIED_BIND_MOUNT_SOURCES_EXACT {
515        if source == Path::new(denied) {
516            return Err(NucleusError::ConfigError(format!(
517                "Bind mount source '{}' is a sensitive host path and cannot be mounted into containers",
518                source.display()
519            )));
520        }
521    }
522
523    for denied in DENIED_BIND_MOUNT_SOURCE_PREFIXES {
524        let denied_path = Path::new(denied);
525        if source == denied_path || source.starts_with(denied_path) {
526            return Err(NucleusError::ConfigError(format!(
527                "Bind mount source '{}' is under sensitive host path '{}' and cannot be mounted into containers",
528                source.display(),
529                denied
530            )));
531        }
532    }
533
534    Ok(())
535}
536
537/// Validate that a bind mount source is not a sensitive host path or subtree.
538pub fn validate_bind_mount_source(source: &Path) -> Result<()> {
539    let normalized = normalize_bind_mount_source_for_policy(source)?;
540    reject_denied_bind_mount_source(&normalized)?;
541
542    let canonical = std::fs::canonicalize(source).map_err(|e| {
543        NucleusError::ConfigError(format!(
544            "Failed to resolve bind mount source {:?}: {}",
545            source, e
546        ))
547    })?;
548    reject_denied_bind_mount_source(&canonical)
549}
550
551/// Mount persistent bind volumes and ephemeral tmpfs volumes into the container root.
552pub fn mount_volumes(root: &Path, volumes: &[crate::container::VolumeMount]) -> Result<()> {
553    use crate::container::VolumeSource;
554
555    if volumes.is_empty() {
556        return Ok(());
557    }
558
559    info!("Mounting {} volume(s) into container", volumes.len());
560
561    for volume in volumes {
562        let dest = resolve_container_destination(root, &volume.dest)?;
563
564        match &volume.source {
565            VolumeSource::Bind { source } => {
566                // H7: Deny bind-mounting sensitive host paths
567                validate_bind_mount_source(source)?;
568
569                // Use symlink_metadata (lstat) instead of .exists() to avoid
570                // following symlinks in the existence check (O_NOFOLLOW semantics).
571                if std::fs::symlink_metadata(source).is_err() {
572                    return Err(NucleusError::FilesystemError(format!(
573                        "Volume source does not exist: {:?}",
574                        source
575                    )));
576                }
577
578                if let Some(parent) = dest.parent() {
579                    std::fs::create_dir_all(parent).map_err(|e| {
580                        NucleusError::FilesystemError(format!(
581                            "Failed to create volume mount parent {:?}: {}",
582                            parent, e
583                        ))
584                    })?;
585                }
586
587                let recursive = source.is_dir();
588                if source.is_file() {
589                    std::fs::write(&dest, "").map_err(|e| {
590                        NucleusError::FilesystemError(format!(
591                            "Failed to create volume mount point {:?}: {}",
592                            dest, e
593                        ))
594                    })?;
595                } else {
596                    std::fs::create_dir_all(&dest).map_err(|e| {
597                        NucleusError::FilesystemError(format!(
598                            "Failed to create volume mount dir {:?}: {}",
599                            dest, e
600                        ))
601                    })?;
602                }
603
604                let initial_flags = if recursive {
605                    MsFlags::MS_BIND | MsFlags::MS_REC
606                } else {
607                    MsFlags::MS_BIND
608                };
609                mount(
610                    Some(source.as_path()),
611                    &dest,
612                    None::<&str>,
613                    initial_flags,
614                    None::<&str>,
615                )
616                .map_err(|e| {
617                    NucleusError::FilesystemError(format!(
618                        "Failed to bind mount volume {:?} -> {:?}: {}",
619                        source, dest, e
620                    ))
621                })?;
622
623                let mut remount_flags =
624                    MsFlags::MS_REMOUNT | MsFlags::MS_BIND | MsFlags::MS_NOSUID | MsFlags::MS_NODEV;
625                if recursive {
626                    remount_flags |= MsFlags::MS_REC;
627                }
628                if volume.read_only {
629                    remount_flags |= MsFlags::MS_RDONLY;
630                }
631
632                mount(
633                    None::<&str>,
634                    &dest,
635                    None::<&str>,
636                    remount_flags,
637                    None::<&str>,
638                )
639                .map_err(|e| {
640                    NucleusError::FilesystemError(format!(
641                        "Failed to remount volume {:?} with final flags: {}",
642                        dest, e
643                    ))
644                })?;
645
646                info!(
647                    "Mounted bind volume {:?} -> {:?} ({})",
648                    source,
649                    volume.dest,
650                    if volume.read_only { "ro" } else { "rw" }
651                );
652            }
653            VolumeSource::Tmpfs { size } => {
654                std::fs::create_dir_all(&dest).map_err(|e| {
655                    NucleusError::FilesystemError(format!(
656                        "Failed to create tmpfs mount dir {:?}: {}",
657                        dest, e
658                    ))
659                })?;
660
661                // M8: Validate size parameter to prevent option injection.
662                // Only allow digits, optionally followed by K/M/G suffix.
663                if let Some(value) = size.as_ref() {
664                    let valid = value
665                        .chars()
666                        .all(|c| c.is_ascii_digit() || "kKmMgG".contains(c));
667                    if !valid || value.is_empty() {
668                        return Err(NucleusError::FilesystemError(format!(
669                            "Invalid tmpfs size value '{}': only digits with optional K/M/G suffix allowed",
670                            value
671                        )));
672                    }
673                }
674
675                // M7: Default to 64MB instead of half of physical RAM to
676                // prevent memory DoS from unbounded tmpfs volumes.
677                let mount_data = size
678                    .as_ref()
679                    .map(|value| format!("size={},mode=0700", value))
680                    .unwrap_or_else(|| "size=64M,mode=0700".to_string());
681
682                let mut flags = MsFlags::MS_NOSUID | MsFlags::MS_NODEV;
683                if volume.read_only {
684                    flags |= MsFlags::MS_RDONLY;
685                }
686                mount(
687                    Some("tmpfs"),
688                    &dest,
689                    Some("tmpfs"),
690                    flags,
691                    Some(mount_data.as_str()),
692                )
693                .map_err(|e| {
694                    NucleusError::FilesystemError(format!(
695                        "Failed to mount tmpfs volume at {:?}: {}",
696                        dest, e
697                    ))
698                })?;
699
700                info!(
701                    "Mounted tmpfs volume at {:?}{}{}",
702                    volume.dest,
703                    size.as_ref()
704                        .map(|value| format!(" (size={})", value))
705                        .unwrap_or_default(),
706                    if volume.read_only { " (ro)" } else { "" }
707                );
708            }
709        }
710    }
711
712    Ok(())
713}
714
715/// Mount procfs at the given path
716///
717/// In rootless mode, procfs mounting should work due to user namespace capabilities.
718/// When `hide_pids` is true, mounts with hidepid=2 so processes cannot enumerate
719/// other PIDs (production hardening).
720pub fn mount_procfs(
721    proc_path: &Path,
722    best_effort: bool,
723    read_only: bool,
724    hide_pids: bool,
725) -> Result<()> {
726    info!(
727        "Mounting procfs at {:?} (hidepid={})",
728        proc_path,
729        if hide_pids { "2" } else { "0" }
730    );
731
732    let mount_data: Option<&str> = if hide_pids { Some("hidepid=2") } else { None };
733
734    match mount(
735        Some("proc"),
736        proc_path,
737        Some("proc"),
738        MsFlags::MS_NOSUID | MsFlags::MS_NODEV | MsFlags::MS_NOEXEC,
739        mount_data,
740    ) {
741        Ok(_) => {
742            if read_only {
743                mount(
744                    None::<&str>,
745                    proc_path,
746                    None::<&str>,
747                    MsFlags::MS_REMOUNT
748                        | MsFlags::MS_RDONLY
749                        | MsFlags::MS_NOSUID
750                        | MsFlags::MS_NODEV
751                        | MsFlags::MS_NOEXEC,
752                    None::<&str>,
753                )
754                .map_err(|e| {
755                    NucleusError::FilesystemError(format!(
756                        "Failed to remount procfs read-only: {}",
757                        e
758                    ))
759                })?;
760                info!("Successfully mounted procfs (read-only)");
761            } else {
762                info!("Successfully mounted procfs");
763            }
764            Ok(())
765        }
766        Err(e) => {
767            if best_effort {
768                warn!("Failed to mount procfs: {} (continuing anyway)", e);
769                Ok(())
770            } else {
771                Err(NucleusError::FilesystemError(format!(
772                    "Failed to mount procfs: {}",
773                    e
774                )))
775            }
776        }
777    }
778}
779
/// Paths to mask with /dev/null (files) – matches OCI runtime spec masked paths.
/// Exposed for testing; the canonical list of sensitive /proc entries that must
/// be hidden from container processes.
///
/// Names are relative to the procfs mount point; `mask_proc_paths` joins each
/// onto its `proc_path` argument and skips entries that do not exist.
// NOTE(review): the "matches OCI runtime spec" statement comes from the
// original comment and has not been checked against the spec here.
pub const PROC_NULL_MASKED: &[&str] = &[
    "kallsyms",
    "kcore",
    "sched_debug",
    "timer_list",
    "timer_stats",
    "keys",
    "latency_stats",
    "config.gz",
    "sysrq-trigger",
    "kpagecount",
    "kpageflags",
    "kpagecgroup",
];
797
/// Paths to mask with empty tmpfs (directories).
///
/// Names are relative to the procfs mount point; `mask_proc_paths` mounts a
/// read-only tmpfs over each one that exists.
pub const PROC_TMPFS_MASKED: &[&str] = &["acpi", "bus", "irq", "scsi", "sys"];
800
801/// Mask sensitive /proc paths by bind-mounting /dev/null or tmpfs over them
802///
803/// This reduces kernel information leakage from the container. Follows OCI runtime
804/// conventions for masked paths.
805///
806/// SEC-06: When `production` is true, failures to mask critical paths
807/// (kcore, kallsyms, sysrq-trigger) are fatal instead of warn-and-continue.
808pub fn mask_proc_paths(proc_path: &Path, production: bool) -> Result<()> {
809    info!("Masking sensitive /proc paths");
810
811    const CRITICAL_PROC_PATHS: &[&str] = &["kcore", "kallsyms", "sysrq-trigger"];
812
813    let dev_null = Path::new("/dev/null");
814
815    for name in PROC_NULL_MASKED {
816        let target = proc_path.join(name);
817        if !target.exists() {
818            continue;
819        }
820        match mount(
821            Some(dev_null),
822            &target,
823            None::<&str>,
824            MsFlags::MS_BIND,
825            None::<&str>,
826        ) {
827            Ok(_) => {
828                // Remount read-only: Linux ignores MS_RDONLY on the initial bind mount,
829                // so a separate MS_REMOUNT|MS_BIND|MS_RDONLY call is required.
830                if let Err(e) = mount(
831                    None::<&str>,
832                    &target,
833                    None::<&str>,
834                    MsFlags::MS_REMOUNT | MsFlags::MS_BIND | MsFlags::MS_RDONLY,
835                    None::<&str>,
836                ) {
837                    if production && CRITICAL_PROC_PATHS.contains(name) {
838                        return Err(NucleusError::FilesystemError(format!(
839                            "Failed to remount /proc/{} read-only in production mode: {}",
840                            name, e
841                        )));
842                    }
843                    warn!(
844                        "Failed to remount /proc/{} read-only: {} (continuing)",
845                        name, e
846                    );
847                }
848                debug!("Masked /proc/{} (read-only)", name);
849            }
850            Err(e) => {
851                if production && CRITICAL_PROC_PATHS.contains(name) {
852                    return Err(NucleusError::FilesystemError(format!(
853                        "Failed to mask critical /proc/{} in production mode: {}",
854                        name, e
855                    )));
856                }
857                warn!("Failed to mask /proc/{}: {} (continuing)", name, e);
858            }
859        }
860    }
861
862    for name in PROC_TMPFS_MASKED {
863        let target = proc_path.join(name);
864        if !target.exists() {
865            continue;
866        }
867        match mount(
868            Some("tmpfs"),
869            &target,
870            Some("tmpfs"),
871            MsFlags::MS_RDONLY | MsFlags::MS_NOSUID | MsFlags::MS_NODEV | MsFlags::MS_NOEXEC,
872            Some("size=0"),
873        ) {
874            Ok(_) => debug!("Masked /proc/{}", name),
875            Err(e) => {
876                if production {
877                    return Err(NucleusError::FilesystemError(format!(
878                        "Failed to mask /proc/{} in production mode: {}",
879                        name, e
880                    )));
881                }
882                warn!("Failed to mask /proc/{}: {} (continuing)", name, e);
883            }
884        }
885    }
886
887    info!("Finished masking sensitive /proc paths");
888    Ok(())
889}
890
891/// Switch to new root filesystem using pivot_root or chroot
892///
893/// This implements the transition: populated -> pivoted
894/// Fails closed if root switching cannot be established.
895pub fn switch_root(new_root: &Path, allow_chroot_fallback: bool) -> Result<()> {
896    info!("Switching root to {:?}", new_root);
897
898    match pivot_root_impl(new_root) {
899        Ok(()) => {
900            info!("Successfully switched root using pivot_root");
901            Ok(())
902        }
903        Err(e) => {
904            if allow_chroot_fallback {
905                warn!(
906                    "pivot_root failed ({}), falling back to chroot due to explicit \
907                     configuration",
908                    e
909                );
910                chroot_impl(new_root)
911            } else {
912                Err(NucleusError::PivotRootError(format!(
913                    "pivot_root failed: {}. chroot fallback is disabled by default; use \
914                     --allow-chroot-fallback to allow weaker isolation",
915                    e
916                )))
917            }
918        }
919    }
920}
921
922/// Implement root switch using pivot_root(2)
923///
924/// pivot_root is preferred over chroot because:
925/// - More secure (old root can be unmounted)
926/// - Works better with mount namespaces
927fn pivot_root_impl(new_root: &Path) -> Result<()> {
928    use nix::unistd::pivot_root;
929
930    // pivot_root requires new_root to be a mount point
931    // and old_root to be under new_root
932
933    let old_root = new_root.join(".old_root");
934    std::fs::create_dir_all(&old_root).map_err(|e| {
935        NucleusError::PivotRootError(format!("Failed to create old_root directory: {}", e))
936    })?;
937
938    // Perform pivot_root
939    pivot_root(new_root, &old_root)
940        .map_err(|e| NucleusError::PivotRootError(format!("pivot_root syscall failed: {}", e)))?;
941
942    // Change to new root
943    std::env::set_current_dir("/")
944        .map_err(|e| NucleusError::PivotRootError(format!("Failed to chdir to /: {}", e)))?;
945
946    // Unmount old root
947    nix::mount::umount2("/.old_root", nix::mount::MntFlags::MNT_DETACH)
948        .map_err(|e| NucleusError::PivotRootError(format!("Failed to unmount old root: {}", e)))?;
949
950    // Remove old root directory
951    let _ = std::fs::remove_dir("/.old_root");
952
953    Ok(())
954}
955
956/// Implement root switch using chroot(2)
957///
958/// chroot is less secure than pivot_root but works in more situations
959fn chroot_impl(new_root: &Path) -> Result<()> {
960    chroot(new_root)
961        .map_err(|e| NucleusError::PivotRootError(format!("chroot syscall failed: {}", e)))?;
962
963    // Change to new root
964    std::env::set_current_dir("/")
965        .map_err(|e| NucleusError::PivotRootError(format!("Failed to chdir to /: {}", e)))?;
966
967    // L3: Drop CAP_SYS_CHROOT after chroot to prevent escape via nested chroot.
968    // Also close any FDs pointing outside the new root.
969    if let Err(e) = caps::drop(
970        None,
971        caps::CapSet::Bounding,
972        caps::Capability::CAP_SYS_CHROOT,
973    ) {
974        debug!(
975            "Could not drop CAP_SYS_CHROOT after chroot: {} (may not be present)",
976            e
977        );
978    }
979    if let Err(e) = caps::drop(
980        None,
981        caps::CapSet::Effective,
982        caps::Capability::CAP_SYS_CHROOT,
983    ) {
984        debug!(
985            "Could not drop effective CAP_SYS_CHROOT: {} (may not be present)",
986            e
987        );
988    }
989    if let Err(e) = caps::drop(
990        None,
991        caps::CapSet::Permitted,
992        caps::Capability::CAP_SYS_CHROOT,
993    ) {
994        debug!(
995            "Could not drop permitted CAP_SYS_CHROOT: {} (may not be present)",
996            e
997        );
998    }
999
1000    info!("Successfully switched root using chroot (CAP_SYS_CHROOT dropped)");
1001
1002    Ok(())
1003}
1004
1005/// Mount secret files into the container root.
1006///
1007/// Each secret is bind-mounted read-only from its source to the destination
1008/// path inside the container. Intermediate directories are created as needed.
1009pub fn mount_secrets(root: &Path, secrets: &[crate::container::SecretMount]) -> Result<()> {
1010    if secrets.is_empty() {
1011        return Ok(());
1012    }
1013
1014    info!("Mounting {} secret(s) into container", secrets.len());
1015
1016    for secret in secrets {
1017        let source_fd = open(
1018            &secret.source,
1019            OFlag::O_PATH | OFlag::O_NOFOLLOW | OFlag::O_CLOEXEC,
1020            Mode::empty(),
1021        )
1022        .map_err(|e| {
1023            NucleusError::FilesystemError(format!(
1024                "Failed to open secret source {:?} with O_NOFOLLOW: {}",
1025                secret.source, e
1026            ))
1027        })?;
1028        let source_stat = fstat(&source_fd).map_err(|e| {
1029            NucleusError::FilesystemError(format!(
1030                "Failed to stat secret source {:?}: {}",
1031                secret.source, e
1032            ))
1033        })?;
1034        let source_kind = SFlag::from_bits_truncate(source_stat.st_mode);
1035        let source_is_file = source_kind == SFlag::S_IFREG;
1036        let source_is_dir = source_kind == SFlag::S_IFDIR;
1037        if !source_is_file && !source_is_dir {
1038            return Err(NucleusError::FilesystemError(format!(
1039                "Secret source {:?} must be a regular file or directory",
1040                secret.source
1041            )));
1042        }
1043        let source_fd_path = PathBuf::from(format!("/proc/self/fd/{}", source_fd.as_raw_fd()));
1044
1045        // Destination inside container root
1046        let dest = resolve_container_destination(root, &secret.dest)?;
1047
1048        // Create parent directories
1049        if let Some(parent) = dest.parent() {
1050            std::fs::create_dir_all(parent).map_err(|e| {
1051                NucleusError::FilesystemError(format!(
1052                    "Failed to create secret mount parent {:?}: {}",
1053                    parent, e
1054                ))
1055            })?;
1056        }
1057
1058        // Create mount point file
1059        if source_is_file {
1060            std::fs::write(&dest, "").map_err(|e| {
1061                NucleusError::FilesystemError(format!(
1062                    "Failed to create secret mount point {:?}: {}",
1063                    dest, e
1064                ))
1065            })?;
1066        } else {
1067            std::fs::create_dir_all(&dest).map_err(|e| {
1068                NucleusError::FilesystemError(format!(
1069                    "Failed to create secret mount dir {:?}: {}",
1070                    dest, e
1071                ))
1072            })?;
1073        }
1074
1075        // Bind mount read-only
1076        mount(
1077            Some(source_fd_path.as_path()),
1078            &dest,
1079            None::<&str>,
1080            MsFlags::MS_BIND,
1081            None::<&str>,
1082        )
1083        .map_err(|e| {
1084            NucleusError::FilesystemError(format!(
1085                "Failed to bind mount secret {:?}: {}",
1086                secret.source, e
1087            ))
1088        })?;
1089
1090        mount(
1091            None::<&str>,
1092            &dest,
1093            None::<&str>,
1094            MsFlags::MS_REMOUNT
1095                | MsFlags::MS_BIND
1096                | MsFlags::MS_RDONLY
1097                | MsFlags::MS_NOSUID
1098                | MsFlags::MS_NODEV
1099                | MsFlags::MS_NOEXEC,
1100            None::<&str>,
1101        )
1102        .map_err(|e| {
1103            NucleusError::FilesystemError(format!(
1104                "Failed to remount secret {:?} read-only: {}",
1105                dest, e
1106            ))
1107        })?;
1108
1109        // Apply configured file permissions on the mount point
1110        if source_is_file {
1111            use std::os::unix::fs::PermissionsExt;
1112            let perms = std::fs::Permissions::from_mode(secret.mode);
1113            if let Err(e) = std::fs::set_permissions(&dest, perms) {
1114                warn!(
1115                    "Failed to set mode {:04o} on secret {:?}: {} (bind mount may override)",
1116                    secret.mode, dest, e
1117                );
1118            }
1119        }
1120
1121        debug!(
1122            "Mounted secret {:?} -> {:?} (mode {:04o})",
1123            secret.source, secret.dest, secret.mode
1124        );
1125    }
1126
1127    Ok(())
1128}
1129
1130/// Mount secrets onto a dedicated in-memory tmpfs instead of bind-mounting host paths.
1131///
1132/// Creates a per-container tmpfs at `<root>/run/secrets` with MS_NOEXEC | MS_NOSUID | MS_NODEV,
1133/// copies secret contents into it, then zeros the read buffer. This ensures secrets
1134/// never reference host-side files after setup and are never persisted to disk.
1135pub fn mount_secrets_inmemory(
1136    root: &Path,
1137    secrets: &[crate::container::SecretMount],
1138    identity: &crate::container::ProcessIdentity,
1139) -> Result<()> {
1140    if secrets.is_empty() {
1141        return Ok(());
1142    }
1143
1144    info!("Mounting {} secret(s) on in-memory tmpfs", secrets.len());
1145
1146    let secrets_dir = root.join("run/secrets");
1147    std::fs::create_dir_all(&secrets_dir).map_err(|e| {
1148        NucleusError::FilesystemError(format!(
1149            "Failed to create secrets dir {:?}: {}",
1150            secrets_dir, e
1151        ))
1152    })?;
1153
1154    // Mount a size-limited tmpfs for secrets (16 MiB max)
1155    if let Err(e) = mount(
1156        Some("tmpfs"),
1157        &secrets_dir,
1158        Some("tmpfs"),
1159        MsFlags::MS_NOSUID | MsFlags::MS_NODEV | MsFlags::MS_NOEXEC,
1160        Some("size=16m,mode=0700"),
1161    ) {
1162        let _ = std::fs::remove_dir_all(&secrets_dir);
1163        return Err(NucleusError::FilesystemError(format!(
1164            "Failed to mount secrets tmpfs at {:?}: {}",
1165            secrets_dir, e
1166        )));
1167    }
1168
1169    if !identity.is_root() {
1170        nix::unistd::chown(
1171            &secrets_dir,
1172            Some(nix::unistd::Uid::from_raw(identity.uid)),
1173            Some(nix::unistd::Gid::from_raw(identity.gid)),
1174        )
1175        .map_err(|e| {
1176            let _ = nix::mount::umount2(&secrets_dir, nix::mount::MntFlags::MNT_DETACH);
1177            let _ = std::fs::remove_dir_all(&secrets_dir);
1178            NucleusError::FilesystemError(format!(
1179                "Failed to set /run/secrets owner to {}:{}: {}",
1180                identity.uid, identity.gid, e
1181            ))
1182        })?;
1183    }
1184
1185    // Rollback: unmount tmpfs and remove dir if any secret fails
1186    let result = mount_secrets_inmemory_inner(&secrets_dir, root, secrets, identity);
1187    if let Err(ref e) = result {
1188        let _ = nix::mount::umount2(&secrets_dir, nix::mount::MntFlags::MNT_DETACH);
1189        let _ = std::fs::remove_dir_all(&secrets_dir);
1190        return Err(NucleusError::FilesystemError(format!(
1191            "Secret mount failed (rolled back): {}",
1192            e
1193        )));
1194    }
1195
1196    info!("All secrets mounted on in-memory tmpfs");
1197    Ok(())
1198}
1199
1200fn mount_secrets_inmemory_inner(
1201    secrets_dir: &Path,
1202    root: &Path,
1203    secrets: &[crate::container::SecretMount],
1204    identity: &crate::container::ProcessIdentity,
1205) -> Result<()> {
1206    for secret in secrets {
1207        let mut content = read_regular_file_nofollow(&secret.source)?;
1208
1209        // Determine destination path inside the secrets tmpfs
1210        let dest = resolve_container_destination(secrets_dir, &secret.dest)?;
1211
1212        // Create parent directories within the tmpfs
1213        if let Some(parent) = dest.parent() {
1214            std::fs::create_dir_all(parent).map_err(|e| {
1215                NucleusError::FilesystemError(format!(
1216                    "Failed to create secret parent dir {:?}: {}",
1217                    parent, e
1218                ))
1219            })?;
1220        }
1221
1222        // Write secret content to tmpfs
1223        std::fs::write(&dest, &content).map_err(|e| {
1224            NucleusError::FilesystemError(format!("Failed to write secret to {:?}: {}", dest, e))
1225        })?;
1226
1227        // Set permissions
1228        {
1229            use std::os::unix::fs::PermissionsExt;
1230            let perms = std::fs::Permissions::from_mode(secret.mode);
1231            std::fs::set_permissions(&dest, perms).map_err(|e| {
1232                NucleusError::FilesystemError(format!(
1233                    "Failed to set permissions on secret {:?}: {}",
1234                    dest, e
1235                ))
1236            })?;
1237        }
1238
1239        if !identity.is_root() {
1240            nix::unistd::chown(
1241                &dest,
1242                Some(nix::unistd::Uid::from_raw(identity.uid)),
1243                Some(nix::unistd::Gid::from_raw(identity.gid)),
1244            )
1245            .map_err(|e| {
1246                NucleusError::FilesystemError(format!(
1247                    "Failed to set permissions owner on secret {:?} to {}:{}: {}",
1248                    dest, identity.uid, identity.gid, e
1249                ))
1250            })?;
1251        }
1252
1253        // Zero the in-memory buffer
1254        zeroize::Zeroize::zeroize(&mut content);
1255        drop(content);
1256
1257        // Also bind-mount the secret to its expected container path for compatibility
1258        let container_dest = resolve_container_destination(root, &secret.dest)?;
1259        if container_dest != dest {
1260            if let Some(parent) = container_dest.parent() {
1261                std::fs::create_dir_all(parent).map_err(|e| {
1262                    NucleusError::FilesystemError(format!(
1263                        "Failed to create secret mount parent {:?}: {}",
1264                        parent, e
1265                    ))
1266                })?;
1267            }
1268
1269            std::fs::write(&container_dest, "").map_err(|e| {
1270                NucleusError::FilesystemError(format!(
1271                    "Failed to create secret mount point {:?}: {}",
1272                    container_dest, e
1273                ))
1274            })?;
1275
1276            mount(
1277                Some(dest.as_path()),
1278                &container_dest,
1279                None::<&str>,
1280                MsFlags::MS_BIND,
1281                None::<&str>,
1282            )
1283            .map_err(|e| {
1284                NucleusError::FilesystemError(format!(
1285                    "Failed to bind mount secret {:?} -> {:?}: {}",
1286                    dest, container_dest, e
1287                ))
1288            })?;
1289
1290            mount(
1291                None::<&str>,
1292                &container_dest,
1293                None::<&str>,
1294                MsFlags::MS_REMOUNT
1295                    | MsFlags::MS_BIND
1296                    | MsFlags::MS_RDONLY
1297                    | MsFlags::MS_NOSUID
1298                    | MsFlags::MS_NODEV
1299                    | MsFlags::MS_NOEXEC,
1300                None::<&str>,
1301            )
1302            .map_err(|e| {
1303                NucleusError::FilesystemError(format!(
1304                    "Failed to remount secret {:?} read-only: {}",
1305                    container_dest, e
1306                ))
1307            })?;
1308        }
1309
1310        debug!(
1311            "Secret {:?} -> {:?} (in-memory tmpfs, mode {:04o})",
1312            secret.source, secret.dest, secret.mode
1313        );
1314    }
1315
1316    Ok(())
1317}
1318
#[cfg(test)]
mod tests {
    use super::*;
    use std::os::unix::fs::symlink;

    /// Bind-mount sources inside sensitive host subtrees must be refused.
    #[test]
    fn test_validate_bind_mount_source_rejects_sensitive_subtrees() {
        let sensitive = ["/proc/sys", "/sys/fs/cgroup", "/dev/kmsg", "/boot"];
        for path in sensitive {
            let err = validate_bind_mount_source(Path::new(path)).unwrap_err();
            let rejected = err.to_string().contains("sensitive host path");
            assert!(
                rejected,
                "expected sensitive-path rejection for {path}, got: {err}"
            );
        }
    }

    /// An ordinary directory on the host is an acceptable bind-mount source.
    #[test]
    fn test_validate_bind_mount_source_allows_regular_host_paths() {
        let scratch = tempfile::TempDir::new().unwrap();
        let data_dir = scratch.path().join("data");
        std::fs::create_dir(&data_dir).unwrap();

        validate_bind_mount_source(&data_dir).unwrap();
    }

    /// A path containing `..` that still lands on a regular directory is accepted.
    #[test]
    fn test_validate_bind_mount_source_normalizes_parent_components_before_filtering() {
        let scratch = tempfile::TempDir::new().unwrap();
        let data_dir = scratch.path().join("data");
        std::fs::create_dir(&data_dir).unwrap();

        let roundabout = data_dir.join("../data");
        validate_bind_mount_source(&roundabout).unwrap();
    }

    #[test]
    fn test_proc_mask_includes_sysrq_trigger() {
        let masked = PROC_NULL_MASKED.contains(&"sysrq-trigger");
        assert!(
            masked,
            "/proc/sysrq-trigger must be masked to prevent host DoS"
        );
    }

    #[test]
    fn test_proc_mask_includes_timer_stats() {
        let masked = PROC_NULL_MASKED.contains(&"timer_stats");
        assert!(
            masked,
            "/proc/timer_stats must be masked to prevent kernel info leakage"
        );
    }

    #[test]
    fn test_proc_mask_includes_kpage_files() {
        for path in ["kpagecount", "kpageflags", "kpagecgroup"] {
            assert!(
                PROC_NULL_MASKED.contains(&path),
                "/proc/{} must be masked to prevent host memory layout leakage",
                path
            );
        }
    }

    #[test]
    fn test_proc_mask_includes_oci_standard_paths() {
        // OCI runtime spec required masked paths
        let null_masked = ["kallsyms", "kcore", "sched_debug", "keys", "config.gz"];
        for path in null_masked {
            assert!(
                PROC_NULL_MASKED.contains(&path),
                "/proc/{} must be in null-masked list (OCI spec)",
                path
            );
        }

        let tmpfs_masked = ["acpi", "bus", "scsi", "sys"];
        for path in tmpfs_masked {
            assert!(
                PROC_TMPFS_MASKED.contains(&path),
                "/proc/{} must be in tmpfs-masked list (OCI spec)",
                path
            );
        }
    }

    /// A plain regular file is read back verbatim.
    #[test]
    fn test_read_regular_file_nofollow_reads_regular_file() {
        let scratch = tempfile::tempdir().unwrap();
        let secret_path = scratch.path().join("secret.txt");
        std::fs::write(&secret_path, "supersecret").unwrap();

        let bytes = read_regular_file_nofollow(&secret_path).unwrap();
        assert_eq!(bytes, b"supersecret");
    }

    /// Reading through a symlink must be refused, with an error that mentions O_NOFOLLOW.
    #[test]
    fn test_read_regular_file_nofollow_rejects_symlink() {
        let scratch = tempfile::tempdir().unwrap();
        let real_file = scratch.path().join("target.txt");
        let alias = scratch.path().join("secret-link");
        std::fs::write(&real_file, "supersecret").unwrap();
        symlink(&real_file, &alias).unwrap();

        let err = read_regular_file_nofollow(&alias).unwrap_err();
        let mentions_nofollow = err.to_string().contains("O_NOFOLLOW");
        assert!(mentions_nofollow, "symlink reads must fail via O_NOFOLLOW");
    }
}