Skip to main content

nucleus/filesystem/
mount.rs

1use crate::error::{NucleusError, Result};
2use nix::mount::{mount, MsFlags};
3use nix::sys::stat::{makedev, mknod, Mode, SFlag};
4use nix::unistd::chroot;
5use std::path::{Component, Path, PathBuf};
6use tracing::{debug, info, warn};
7
/// One entry of the mount audit policy: a mount point together with the
/// options it is required to carry.
struct ExpectedMount {
    /// Absolute mount point, as it appears in the mount table.
    path: &'static str,
    /// Mount options that must all be present on the mount.
    required_flags: &'static [&'static str],
    /// Whether the mount is mandatory in production mode. A missing critical
    /// mount (e.g. /proc) is reported as a violation; a missing non-critical
    /// mount is skipped.
    critical: bool,
}
16
17/// Known mount paths and the flags they must carry in production mode.
18const PRODUCTION_MOUNT_EXPECTATIONS: &[ExpectedMount] = &[
19    ExpectedMount {
20        path: "/bin",
21        required_flags: &["ro", "nosuid", "nodev"],
22        critical: true,
23    },
24    ExpectedMount {
25        path: "/usr",
26        required_flags: &["ro", "nosuid", "nodev"],
27        critical: true,
28    },
29    ExpectedMount {
30        path: "/lib",
31        required_flags: &["ro", "nosuid", "nodev"],
32        critical: false, // not all rootfs layouts have /lib
33    },
34    ExpectedMount {
35        path: "/lib64",
36        required_flags: &["ro", "nosuid", "nodev"],
37        critical: false, // not all rootfs layouts have /lib64
38    },
39    ExpectedMount {
40        path: "/etc",
41        required_flags: &["ro", "nosuid", "nodev"],
42        critical: true,
43    },
44    ExpectedMount {
45        path: "/nix",
46        required_flags: &["ro", "nosuid", "nodev"],
47        critical: false, // only present on NixOS-based rootfs
48    },
49    ExpectedMount {
50        path: "/sbin",
51        required_flags: &["ro", "nosuid", "nodev"],
52        critical: false, // not all rootfs layouts have /sbin
53    },
54    ExpectedMount {
55        path: "/proc",
56        required_flags: &["nosuid", "nodev", "noexec"],
57        critical: true,
58    },
59    ExpectedMount {
60        path: "/run/secrets",
61        required_flags: &["nosuid", "nodev", "noexec"],
62        critical: false, // only present when secrets are configured
63    },
64];
65
66/// Normalize an absolute container destination path and reject traversal.
67///
68/// Returns a normalized absolute path containing only `RootDir` and `Normal`
69/// components. `.` segments are ignored; `..` and relative paths are rejected.
70pub fn normalize_container_destination(dest: &Path) -> Result<PathBuf> {
71    if !dest.is_absolute() {
72        return Err(NucleusError::ConfigError(format!(
73            "Container destination must be absolute: {:?}",
74            dest
75        )));
76    }
77
78    let mut normalized = PathBuf::from("/");
79    let mut saw_component = false;
80
81    for component in dest.components() {
82        match component {
83            Component::RootDir => {}
84            Component::CurDir => {}
85            Component::Normal(part) => {
86                normalized.push(part);
87                saw_component = true;
88            }
89            Component::ParentDir => {
90                return Err(NucleusError::ConfigError(format!(
91                    "Container destination must not contain parent traversal: {:?}",
92                    dest
93                )));
94            }
95            Component::Prefix(_) => {
96                return Err(NucleusError::ConfigError(format!(
97                    "Unsupported container destination prefix: {:?}",
98                    dest
99                )));
100            }
101        }
102    }
103
104    if !saw_component {
105        return Err(NucleusError::ConfigError(format!(
106            "Container destination must not be the root directory: {:?}",
107            dest
108        )));
109    }
110
111    Ok(normalized)
112}
113
114/// Resolve a validated container destination under a host-side root directory.
115pub fn resolve_container_destination(root: &Path, dest: &Path) -> Result<PathBuf> {
116    let normalized = normalize_container_destination(dest)?;
117    let relative = normalized.strip_prefix("/").map_err(|_| {
118        NucleusError::ConfigError(format!(
119            "Container destination is not absolute after normalization: {:?}",
120            normalized
121        ))
122    })?;
123    Ok(root.join(relative))
124}
125
126/// Audit all mounts in the container's mount namespace.
127///
128/// Reads /proc/self/mounts and verifies that each known mount point carries
129/// its expected flags. In production mode, any missing flag is fatal.
130/// Returns Ok(()) if all checks pass, or a list of violations.
131pub fn audit_mounts(production_mode: bool) -> Result<()> {
132    let mounts_content = std::fs::read_to_string("/proc/self/mounts").map_err(|e| {
133        NucleusError::FilesystemError(format!("Failed to read /proc/self/mounts: {}", e))
134    })?;
135
136    let mut violations = Vec::new();
137
138    for expectation in PRODUCTION_MOUNT_EXPECTATIONS {
139        // Find the mount entry for this path
140        let mount_entry = mounts_content.lines().find(|line| {
141            let parts: Vec<&str> = line.split_whitespace().collect();
142            parts.len() >= 4 && parts[1] == expectation.path
143        });
144
145        if let Some(entry) = mount_entry {
146            let parts: Vec<&str> = entry.split_whitespace().collect();
147            if parts.len() >= 4 {
148                let options = parts[3];
149                for &flag in expectation.required_flags {
150                    if !options.split(',').any(|opt| opt == flag) {
151                        violations.push(format!(
152                            "Mount {} missing required flag '{}' (has: {})",
153                            expectation.path, flag, options
154                        ));
155                    }
156                }
157            }
158        } else if expectation.critical && production_mode {
159            violations.push(format!(
160                "Critical mount {} is missing from the mount namespace",
161                expectation.path
162            ));
163        }
164    }
165
166    if violations.is_empty() {
167        info!("Mount audit passed: all expected flags verified");
168        Ok(())
169    } else if production_mode {
170        Err(NucleusError::FilesystemError(format!(
171            "Mount audit failed in production mode:\n  {}",
172            violations.join("\n  ")
173        )))
174    } else {
175        for v in &violations {
176            warn!("Mount audit: {}", v);
177        }
178        Ok(())
179    }
180}
181
182/// Create minimal filesystem structure in the new root
183pub fn create_minimal_fs(root: &Path) -> Result<()> {
184    info!("Creating minimal filesystem structure at {:?}", root);
185
186    // Create essential directories
187    let dirs = vec![
188        "dev",
189        "proc",
190        "sys",
191        "tmp",
192        "bin",
193        "sbin",
194        "usr",
195        "lib",
196        "lib64",
197        "etc",
198        "nix",
199        "nix/store",
200        "run",
201        "context",
202    ];
203
204    for dir in dirs {
205        let path = root.join(dir);
206        std::fs::create_dir_all(&path).map_err(|e| {
207            NucleusError::FilesystemError(format!("Failed to create directory {:?}: {}", path, e))
208        })?;
209    }
210
211    info!("Created minimal filesystem structure");
212
213    Ok(())
214}
215
216/// Create essential device nodes in /dev
217///
218/// In rootless mode, device node creation will fail gracefully
219pub fn create_dev_nodes(dev_path: &Path, include_tty: bool) -> Result<()> {
220    info!("Creating device nodes at {:?}", dev_path);
221
222    // Device nodes: (name, type, major, minor)
223    let mut devices = vec![
224        ("null", SFlag::S_IFCHR, 1, 3),
225        ("zero", SFlag::S_IFCHR, 1, 5),
226        ("full", SFlag::S_IFCHR, 1, 7),
227        ("random", SFlag::S_IFCHR, 1, 8),
228        ("urandom", SFlag::S_IFCHR, 1, 9),
229    ];
230    if include_tty {
231        devices.push(("tty", SFlag::S_IFCHR, 5, 0));
232    }
233
234    let mut created_count = 0;
235    let mut failed_count = 0;
236
237    for (name, dev_type, major, minor) in devices {
238        let path = dev_path.join(name);
239        let mode = Mode::from_bits_truncate(0o660);
240        let dev = makedev(major, minor);
241
242        match mknod(&path, dev_type, mode, dev) {
243            Ok(_) => {
244                info!("Created device node: {:?}", path);
245                created_count += 1;
246            }
247            Err(e) => {
248                // In rootless mode, mknod fails - this is expected
249                warn!(
250                    "Failed to create device node {:?}: {} (this is normal in rootless mode)",
251                    path, e
252                );
253                failed_count += 1;
254            }
255        }
256    }
257
258    if created_count > 0 {
259        info!("Successfully created {} device nodes", created_count);
260    }
261    if failed_count > 0 {
262        info!("Skipped {} device nodes (rootless mode)", failed_count);
263    }
264
265    Ok(())
266}
267
268/// Bind mount a pre-built rootfs (e.g. a Nix store closure) into the container.
269///
270/// Instead of exposing the full host /bin, /usr, /lib, /lib64, /nix, this mounts
271/// a minimal, purpose-built root filesystem. Suitable for production services.
272pub fn bind_mount_rootfs(root: &Path, rootfs_path: &Path) -> Result<()> {
273    info!(
274        "Bind mounting production rootfs {:?} into container {:?}",
275        rootfs_path, root
276    );
277
278    if std::fs::symlink_metadata(rootfs_path).is_err() {
279        return Err(NucleusError::FilesystemError(format!(
280            "Rootfs path does not exist: {:?}",
281            rootfs_path
282        )));
283    }
284
285    // Bind mount the rootfs contents into the container root.
286    // The rootfs is expected to contain a standard FHS layout (/bin, /lib, /etc, etc.)
287    // produced by a Nix buildEnv or similar.
288    let subdirs = ["bin", "sbin", "lib", "lib64", "usr", "etc", "nix"];
289
290    for subdir in &subdirs {
291        let source = rootfs_path.join(subdir);
292        if !source.exists() {
293            debug!("Rootfs subdir {} not present, skipping", subdir);
294            continue;
295        }
296
297        let target = root.join(subdir);
298        std::fs::create_dir_all(&target).map_err(|e| {
299            NucleusError::FilesystemError(format!(
300                "Failed to create mount point {:?}: {}",
301                target, e
302            ))
303        })?;
304
305        mount(
306            Some(&source),
307            &target,
308            None::<&str>,
309            MsFlags::MS_BIND | MsFlags::MS_REC,
310            None::<&str>,
311        )
312        .map_err(|e| {
313            NucleusError::FilesystemError(format!(
314                "Failed to bind mount rootfs {:?} -> {:?}: {}",
315                source, target, e
316            ))
317        })?;
318
319        // Remount read-only
320        mount(
321            None::<&str>,
322            &target,
323            None::<&str>,
324            MsFlags::MS_REMOUNT
325                | MsFlags::MS_BIND
326                | MsFlags::MS_RDONLY
327                | MsFlags::MS_REC
328                | MsFlags::MS_NOSUID
329                | MsFlags::MS_NODEV,
330            None::<&str>,
331        )
332        .map_err(|e| {
333            NucleusError::FilesystemError(format!(
334                "Failed to remount rootfs {:?} read-only: {}",
335                target, e
336            ))
337        })?;
338
339        info!("Mounted rootfs/{} read-only", subdir);
340    }
341
342    Ok(())
343}
344
345/// Bind mount essential host directories into container
346///
347/// This allows host binaries to be accessible inside the container.
348/// Used in agent mode. Production mode should use bind_mount_rootfs() instead.
349pub fn bind_mount_host_paths(root: &Path, best_effort: bool) -> Result<()> {
350    info!("Bind mounting host paths into container");
351
352    // Essential paths to bind mount (read-only)
353    let host_paths = vec![
354        "/bin", "/usr", "/lib", "/lib64", "/nix", // For NixOS
355    ];
356
357    for host_path in host_paths {
358        let host = Path::new(host_path);
359
360        // Only mount if the path exists on the host
361        if !host.exists() {
362            debug!("Skipping {} (not present on host)", host_path);
363            continue;
364        }
365
366        let container_path = root.join(host_path.trim_start_matches('/'));
367
368        // Create mount point
369        if let Err(e) = std::fs::create_dir_all(&container_path) {
370            if best_effort {
371                warn!("Failed to create mount point {:?}: {}", container_path, e);
372                continue;
373            }
374            return Err(NucleusError::FilesystemError(format!(
375                "Failed to create mount point {:?}: {}",
376                container_path, e
377            )));
378        }
379
380        // Attempt bind mount
381        // Note: Linux ignores MS_RDONLY on the initial bind mount call.
382        // A second remount is required to actually enforce read-only.
383        match mount(
384            Some(host),
385            &container_path,
386            None::<&str>,
387            MsFlags::MS_BIND | MsFlags::MS_REC,
388            None::<&str>,
389        ) {
390            Ok(_) => {
391                // Remount as read-only – required because MS_RDONLY is ignored on initial bind
392                mount(
393                    None::<&str>,
394                    &container_path,
395                    None::<&str>,
396                    MsFlags::MS_REMOUNT
397                        | MsFlags::MS_BIND
398                        | MsFlags::MS_RDONLY
399                        | MsFlags::MS_REC
400                        | MsFlags::MS_NOSUID
401                        | MsFlags::MS_NODEV,
402                    None::<&str>,
403                )
404                .map_err(|e| {
405                    NucleusError::FilesystemError(format!(
406                        "Failed to remount {} as read-only: {}",
407                        host_path, e
408                    ))
409                })?;
410                info!(
411                    "Bind mounted {} to {:?} (read-only)",
412                    host_path, container_path
413                );
414            }
415            Err(e) => {
416                if best_effort {
417                    warn!(
418                        "Failed to bind mount {}: {} (continuing anyway)",
419                        host_path, e
420                    );
421                } else {
422                    return Err(NucleusError::FilesystemError(format!(
423                        "Failed to bind mount {}: {}",
424                        host_path, e
425                    )));
426                }
427            }
428        }
429    }
430
431    Ok(())
432}
433
434/// H7: Sensitive host paths that must not be bind-mounted into containers.
435const DENIED_BIND_MOUNT_SOURCES: &[&str] = &[
436    "/",
437    "/proc",
438    "/sys",
439    "/dev",
440    "/boot",
441    "/etc/shadow",
442    "/etc/sudoers",
443    "/etc/passwd",
444    "/etc/gshadow",
445];
446
447/// Validate that a bind mount source is not a sensitive host path.
448fn validate_bind_mount_source(source: &Path) -> Result<()> {
449    let source_str = source.to_string_lossy();
450    for denied in DENIED_BIND_MOUNT_SOURCES {
451        if source_str == *denied {
452            return Err(NucleusError::FilesystemError(format!(
453                "Bind mount source '{}' is a sensitive host path and cannot be mounted into containers",
454                source.display()
455            )));
456        }
457    }
458    Ok(())
459}
460
461/// Mount persistent bind volumes and ephemeral tmpfs volumes into the container root.
462pub fn mount_volumes(root: &Path, volumes: &[crate::container::VolumeMount]) -> Result<()> {
463    use crate::container::VolumeSource;
464
465    if volumes.is_empty() {
466        return Ok(());
467    }
468
469    info!("Mounting {} volume(s) into container", volumes.len());
470
471    for volume in volumes {
472        let dest = resolve_container_destination(root, &volume.dest)?;
473
474        match &volume.source {
475            VolumeSource::Bind { source } => {
476                // H7: Deny bind-mounting sensitive host paths
477                validate_bind_mount_source(source)?;
478
479                // Use symlink_metadata (lstat) instead of .exists() to avoid
480                // following symlinks in the existence check (O_NOFOLLOW semantics).
481                if std::fs::symlink_metadata(source).is_err() {
482                    return Err(NucleusError::FilesystemError(format!(
483                        "Volume source does not exist: {:?}",
484                        source
485                    )));
486                }
487
488                if let Some(parent) = dest.parent() {
489                    std::fs::create_dir_all(parent).map_err(|e| {
490                        NucleusError::FilesystemError(format!(
491                            "Failed to create volume mount parent {:?}: {}",
492                            parent, e
493                        ))
494                    })?;
495                }
496
497                let recursive = source.is_dir();
498                if source.is_file() {
499                    std::fs::write(&dest, "").map_err(|e| {
500                        NucleusError::FilesystemError(format!(
501                            "Failed to create volume mount point {:?}: {}",
502                            dest, e
503                        ))
504                    })?;
505                } else {
506                    std::fs::create_dir_all(&dest).map_err(|e| {
507                        NucleusError::FilesystemError(format!(
508                            "Failed to create volume mount dir {:?}: {}",
509                            dest, e
510                        ))
511                    })?;
512                }
513
514                let initial_flags = if recursive {
515                    MsFlags::MS_BIND | MsFlags::MS_REC
516                } else {
517                    MsFlags::MS_BIND
518                };
519                mount(
520                    Some(source.as_path()),
521                    &dest,
522                    None::<&str>,
523                    initial_flags,
524                    None::<&str>,
525                )
526                .map_err(|e| {
527                    NucleusError::FilesystemError(format!(
528                        "Failed to bind mount volume {:?} -> {:?}: {}",
529                        source, dest, e
530                    ))
531                })?;
532
533                let mut remount_flags =
534                    MsFlags::MS_REMOUNT | MsFlags::MS_BIND | MsFlags::MS_NOSUID | MsFlags::MS_NODEV;
535                if recursive {
536                    remount_flags |= MsFlags::MS_REC;
537                }
538                if volume.read_only {
539                    remount_flags |= MsFlags::MS_RDONLY;
540                }
541
542                mount(
543                    None::<&str>,
544                    &dest,
545                    None::<&str>,
546                    remount_flags,
547                    None::<&str>,
548                )
549                .map_err(|e| {
550                    NucleusError::FilesystemError(format!(
551                        "Failed to remount volume {:?} with final flags: {}",
552                        dest, e
553                    ))
554                })?;
555
556                info!(
557                    "Mounted bind volume {:?} -> {:?} ({})",
558                    source,
559                    volume.dest,
560                    if volume.read_only { "ro" } else { "rw" }
561                );
562            }
563            VolumeSource::Tmpfs { size } => {
564                std::fs::create_dir_all(&dest).map_err(|e| {
565                    NucleusError::FilesystemError(format!(
566                        "Failed to create tmpfs mount dir {:?}: {}",
567                        dest, e
568                    ))
569                })?;
570
571                // M8: Validate size parameter to prevent option injection.
572                // Only allow digits, optionally followed by K/M/G suffix.
573                if let Some(value) = size.as_ref() {
574                    let valid = value
575                        .chars()
576                        .all(|c| c.is_ascii_digit() || "kKmMgG".contains(c));
577                    if !valid || value.is_empty() {
578                        return Err(NucleusError::FilesystemError(format!(
579                            "Invalid tmpfs size value '{}': only digits with optional K/M/G suffix allowed",
580                            value
581                        )));
582                    }
583                }
584
585                // M7: Default to 64MB instead of half of physical RAM to
586                // prevent memory DoS from unbounded tmpfs volumes.
587                let mount_data = size
588                    .as_ref()
589                    .map(|value| format!("size={},mode=0700", value))
590                    .unwrap_or_else(|| "size=64M,mode=0700".to_string());
591
592                let mut flags = MsFlags::MS_NOSUID | MsFlags::MS_NODEV;
593                if volume.read_only {
594                    flags |= MsFlags::MS_RDONLY;
595                }
596                mount(
597                    Some("tmpfs"),
598                    &dest,
599                    Some("tmpfs"),
600                    flags,
601                    Some(mount_data.as_str()),
602                )
603                .map_err(|e| {
604                    NucleusError::FilesystemError(format!(
605                        "Failed to mount tmpfs volume at {:?}: {}",
606                        dest, e
607                    ))
608                })?;
609
610                info!(
611                    "Mounted tmpfs volume at {:?}{}{}",
612                    volume.dest,
613                    size.as_ref()
614                        .map(|value| format!(" (size={})", value))
615                        .unwrap_or_default(),
616                    if volume.read_only { " (ro)" } else { "" }
617                );
618            }
619        }
620    }
621
622    Ok(())
623}
624
625/// Mount procfs at the given path
626///
627/// In rootless mode, procfs mounting should work due to user namespace capabilities.
628/// When `hide_pids` is true, mounts with hidepid=2 so processes cannot enumerate
629/// other PIDs (production hardening).
630pub fn mount_procfs(
631    proc_path: &Path,
632    best_effort: bool,
633    read_only: bool,
634    hide_pids: bool,
635) -> Result<()> {
636    info!(
637        "Mounting procfs at {:?} (hidepid={})",
638        proc_path,
639        if hide_pids { "2" } else { "0" }
640    );
641
642    let mount_data: Option<&str> = if hide_pids { Some("hidepid=2") } else { None };
643
644    match mount(
645        Some("proc"),
646        proc_path,
647        Some("proc"),
648        MsFlags::MS_NOSUID | MsFlags::MS_NODEV | MsFlags::MS_NOEXEC,
649        mount_data,
650    ) {
651        Ok(_) => {
652            if read_only {
653                mount(
654                    None::<&str>,
655                    proc_path,
656                    None::<&str>,
657                    MsFlags::MS_REMOUNT
658                        | MsFlags::MS_RDONLY
659                        | MsFlags::MS_NOSUID
660                        | MsFlags::MS_NODEV
661                        | MsFlags::MS_NOEXEC,
662                    None::<&str>,
663                )
664                .map_err(|e| {
665                    NucleusError::FilesystemError(format!(
666                        "Failed to remount procfs read-only: {}",
667                        e
668                    ))
669                })?;
670                info!("Successfully mounted procfs (read-only)");
671            } else {
672                info!("Successfully mounted procfs");
673            }
674            Ok(())
675        }
676        Err(e) => {
677            if best_effort {
678                warn!("Failed to mount procfs: {} (continuing anyway)", e);
679                Ok(())
680            } else {
681                Err(NucleusError::FilesystemError(format!(
682                    "Failed to mount procfs: {}",
683                    e
684                )))
685            }
686        }
687    }
688}
689
/// File entries under /proc that are hidden by bind-mounting /dev/null over
/// them, following the masked-paths convention of OCI runtimes.
///
/// Names are relative to the procfs mount point. Public so tests can check
/// the canonical list of sensitive /proc entries.
pub const PROC_NULL_MASKED: &[&str] = &[
    // Kernel symbols and memory image.
    "kallsyms",
    "kcore",
    // Scheduler and timer internals.
    "sched_debug",
    "timer_list",
    "timer_stats",
    // Keyring, latency statistics and kernel build configuration.
    "keys",
    "latency_stats",
    "config.gz",
    // SysRq trigger.
    "sysrq-trigger",
    // Physical page bookkeeping.
    "kpagecount",
    "kpageflags",
    "kpagecgroup",
];
707
/// Directory entries under /proc that are hidden by mounting an empty tmpfs
/// over them. Names are relative to the procfs mount point.
pub const PROC_TMPFS_MASKED: &[&str] = &[
    "acpi",
    "bus",
    "irq",
    "scsi",
    "sys",
];
710
711/// Mask sensitive /proc paths by bind-mounting /dev/null or tmpfs over them
712///
713/// This reduces kernel information leakage from the container. Follows OCI runtime
714/// conventions for masked paths.
715///
716/// SEC-06: When `production` is true, failures to mask critical paths
717/// (kcore, kallsyms, sysrq-trigger) are fatal instead of warn-and-continue.
718pub fn mask_proc_paths(proc_path: &Path, production: bool) -> Result<()> {
719    info!("Masking sensitive /proc paths");
720
721    const CRITICAL_PROC_PATHS: &[&str] = &["kcore", "kallsyms", "sysrq-trigger"];
722
723    let dev_null = Path::new("/dev/null");
724
725    for name in PROC_NULL_MASKED {
726        let target = proc_path.join(name);
727        if !target.exists() {
728            continue;
729        }
730        match mount(
731            Some(dev_null),
732            &target,
733            None::<&str>,
734            MsFlags::MS_BIND,
735            None::<&str>,
736        ) {
737            Ok(_) => {
738                // Remount read-only: Linux ignores MS_RDONLY on the initial bind mount,
739                // so a separate MS_REMOUNT|MS_BIND|MS_RDONLY call is required.
740                if let Err(e) = mount(
741                    None::<&str>,
742                    &target,
743                    None::<&str>,
744                    MsFlags::MS_REMOUNT | MsFlags::MS_BIND | MsFlags::MS_RDONLY,
745                    None::<&str>,
746                ) {
747                    if production && CRITICAL_PROC_PATHS.contains(name) {
748                        return Err(NucleusError::FilesystemError(format!(
749                            "Failed to remount /proc/{} read-only in production mode: {}",
750                            name, e
751                        )));
752                    }
753                    warn!(
754                        "Failed to remount /proc/{} read-only: {} (continuing)",
755                        name, e
756                    );
757                }
758                debug!("Masked /proc/{} (read-only)", name);
759            }
760            Err(e) => {
761                if production && CRITICAL_PROC_PATHS.contains(name) {
762                    return Err(NucleusError::FilesystemError(format!(
763                        "Failed to mask critical /proc/{} in production mode: {}",
764                        name, e
765                    )));
766                }
767                warn!("Failed to mask /proc/{}: {} (continuing)", name, e);
768            }
769        }
770    }
771
772    for name in PROC_TMPFS_MASKED {
773        let target = proc_path.join(name);
774        if !target.exists() {
775            continue;
776        }
777        match mount(
778            Some("tmpfs"),
779            &target,
780            Some("tmpfs"),
781            MsFlags::MS_RDONLY | MsFlags::MS_NOSUID | MsFlags::MS_NODEV | MsFlags::MS_NOEXEC,
782            Some("size=0"),
783        ) {
784            Ok(_) => debug!("Masked /proc/{}", name),
785            Err(e) => {
786                if production {
787                    return Err(NucleusError::FilesystemError(format!(
788                        "Failed to mask /proc/{} in production mode: {}",
789                        name, e
790                    )));
791                }
792                warn!("Failed to mask /proc/{}: {} (continuing)", name, e);
793            }
794        }
795    }
796
797    info!("Finished masking sensitive /proc paths");
798    Ok(())
799}
800
801/// Switch to new root filesystem using pivot_root or chroot
802///
803/// This implements the transition: populated -> pivoted
804/// Fails closed if root switching cannot be established.
805pub fn switch_root(new_root: &Path, allow_chroot_fallback: bool) -> Result<()> {
806    info!("Switching root to {:?}", new_root);
807
808    match pivot_root_impl(new_root) {
809        Ok(()) => {
810            info!("Successfully switched root using pivot_root");
811            Ok(())
812        }
813        Err(e) => {
814            if allow_chroot_fallback {
815                warn!(
816                    "pivot_root failed ({}), falling back to chroot due to explicit \
817                     configuration",
818                    e
819                );
820                chroot_impl(new_root)
821            } else {
822                Err(NucleusError::PivotRootError(format!(
823                    "pivot_root failed: {}. chroot fallback is disabled by default; use \
824                     --allow-chroot-fallback to allow weaker isolation",
825                    e
826                )))
827            }
828        }
829    }
830}
831
832/// Implement root switch using pivot_root(2)
833///
834/// pivot_root is preferred over chroot because:
835/// - More secure (old root can be unmounted)
836/// - Works better with mount namespaces
837fn pivot_root_impl(new_root: &Path) -> Result<()> {
838    use nix::unistd::pivot_root;
839
840    // pivot_root requires new_root to be a mount point
841    // and old_root to be under new_root
842
843    let old_root = new_root.join(".old_root");
844    std::fs::create_dir_all(&old_root).map_err(|e| {
845        NucleusError::PivotRootError(format!("Failed to create old_root directory: {}", e))
846    })?;
847
848    // Perform pivot_root
849    pivot_root(new_root, &old_root)
850        .map_err(|e| NucleusError::PivotRootError(format!("pivot_root syscall failed: {}", e)))?;
851
852    // Change to new root
853    std::env::set_current_dir("/")
854        .map_err(|e| NucleusError::PivotRootError(format!("Failed to chdir to /: {}", e)))?;
855
856    // Unmount old root
857    nix::mount::umount2("/.old_root", nix::mount::MntFlags::MNT_DETACH)
858        .map_err(|e| NucleusError::PivotRootError(format!("Failed to unmount old root: {}", e)))?;
859
860    // Remove old root directory
861    let _ = std::fs::remove_dir("/.old_root");
862
863    Ok(())
864}
865
866/// Implement root switch using chroot(2)
867///
868/// chroot is less secure than pivot_root but works in more situations
869fn chroot_impl(new_root: &Path) -> Result<()> {
870    chroot(new_root)
871        .map_err(|e| NucleusError::PivotRootError(format!("chroot syscall failed: {}", e)))?;
872
873    // Change to new root
874    std::env::set_current_dir("/")
875        .map_err(|e| NucleusError::PivotRootError(format!("Failed to chdir to /: {}", e)))?;
876
877    // L3: Drop CAP_SYS_CHROOT after chroot to prevent escape via nested chroot.
878    // Also close any FDs pointing outside the new root.
879    if let Err(e) = caps::drop(
880        None,
881        caps::CapSet::Bounding,
882        caps::Capability::CAP_SYS_CHROOT,
883    ) {
884        debug!(
885            "Could not drop CAP_SYS_CHROOT after chroot: {} (may not be present)",
886            e
887        );
888    }
889    if let Err(e) = caps::drop(
890        None,
891        caps::CapSet::Effective,
892        caps::Capability::CAP_SYS_CHROOT,
893    ) {
894        debug!(
895            "Could not drop effective CAP_SYS_CHROOT: {} (may not be present)",
896            e
897        );
898    }
899    if let Err(e) = caps::drop(
900        None,
901        caps::CapSet::Permitted,
902        caps::Capability::CAP_SYS_CHROOT,
903    ) {
904        debug!(
905            "Could not drop permitted CAP_SYS_CHROOT: {} (may not be present)",
906            e
907        );
908    }
909
910    info!("Successfully switched root using chroot (CAP_SYS_CHROOT dropped)");
911
912    Ok(())
913}
914
915/// Mount secret files into the container root.
916///
917/// Each secret is bind-mounted read-only from its source to the destination
918/// path inside the container. Intermediate directories are created as needed.
919pub fn mount_secrets(root: &Path, secrets: &[crate::container::SecretMount]) -> Result<()> {
920    if secrets.is_empty() {
921        return Ok(());
922    }
923
924    info!("Mounting {} secret(s) into container", secrets.len());
925
926    for secret in secrets {
927        // M1: Use O_PATH|O_NOFOLLOW to get an FD that refuses symlinks,
928        // then fstat to confirm the source is a regular file/dir.
929        // This closes the TOCTOU gap between check and mount.
930        let meta = std::fs::symlink_metadata(&secret.source).map_err(|_| {
931            NucleusError::FilesystemError(format!(
932                "Secret source does not exist: {:?}",
933                secret.source
934            ))
935        })?;
936        if meta.file_type().is_symlink() {
937            return Err(NucleusError::FilesystemError(format!(
938                "Secret source {:?} is a symlink; refusing to mount (TOCTOU mitigation)",
939                secret.source
940            )));
941        }
942
943        // Destination inside container root
944        let dest = resolve_container_destination(root, &secret.dest)?;
945
946        // Create parent directories
947        if let Some(parent) = dest.parent() {
948            std::fs::create_dir_all(parent).map_err(|e| {
949                NucleusError::FilesystemError(format!(
950                    "Failed to create secret mount parent {:?}: {}",
951                    parent, e
952                ))
953            })?;
954        }
955
956        // Create mount point file
957        if secret.source.is_file() {
958            std::fs::write(&dest, "").map_err(|e| {
959                NucleusError::FilesystemError(format!(
960                    "Failed to create secret mount point {:?}: {}",
961                    dest, e
962                ))
963            })?;
964        } else {
965            std::fs::create_dir_all(&dest).map_err(|e| {
966                NucleusError::FilesystemError(format!(
967                    "Failed to create secret mount dir {:?}: {}",
968                    dest, e
969                ))
970            })?;
971        }
972
973        // Bind mount read-only
974        mount(
975            Some(secret.source.as_path()),
976            &dest,
977            None::<&str>,
978            MsFlags::MS_BIND,
979            None::<&str>,
980        )
981        .map_err(|e| {
982            NucleusError::FilesystemError(format!(
983                "Failed to bind mount secret {:?}: {}",
984                secret.source, e
985            ))
986        })?;
987
988        mount(
989            None::<&str>,
990            &dest,
991            None::<&str>,
992            MsFlags::MS_REMOUNT
993                | MsFlags::MS_BIND
994                | MsFlags::MS_RDONLY
995                | MsFlags::MS_NOSUID
996                | MsFlags::MS_NODEV
997                | MsFlags::MS_NOEXEC,
998            None::<&str>,
999        )
1000        .map_err(|e| {
1001            NucleusError::FilesystemError(format!(
1002                "Failed to remount secret {:?} read-only: {}",
1003                dest, e
1004            ))
1005        })?;
1006
1007        // Apply configured file permissions on the mount point
1008        if secret.source.is_file() {
1009            use std::os::unix::fs::PermissionsExt;
1010            let perms = std::fs::Permissions::from_mode(secret.mode);
1011            if let Err(e) = std::fs::set_permissions(&dest, perms) {
1012                warn!(
1013                    "Failed to set mode {:04o} on secret {:?}: {} (bind mount may override)",
1014                    secret.mode, dest, e
1015                );
1016            }
1017        }
1018
1019        debug!(
1020            "Mounted secret {:?} -> {:?} (mode {:04o})",
1021            secret.source, secret.dest, secret.mode
1022        );
1023    }
1024
1025    Ok(())
1026}
1027
1028/// Mount secrets onto a dedicated in-memory tmpfs instead of bind-mounting host paths.
1029///
1030/// Creates a per-container tmpfs at `<root>/run/secrets` with MS_NOEXEC | MS_NOSUID | MS_NODEV,
1031/// copies secret contents into it, then zeros the read buffer. This ensures secrets
1032/// never reference host-side files after setup and are never persisted to disk.
1033pub fn mount_secrets_inmemory(
1034    root: &Path,
1035    secrets: &[crate::container::SecretMount],
1036    identity: &crate::container::ProcessIdentity,
1037) -> Result<()> {
1038    if secrets.is_empty() {
1039        return Ok(());
1040    }
1041
1042    info!("Mounting {} secret(s) on in-memory tmpfs", secrets.len());
1043
1044    let secrets_dir = root.join("run/secrets");
1045    std::fs::create_dir_all(&secrets_dir).map_err(|e| {
1046        NucleusError::FilesystemError(format!(
1047            "Failed to create secrets dir {:?}: {}",
1048            secrets_dir, e
1049        ))
1050    })?;
1051
1052    // Mount a size-limited tmpfs for secrets (16 MiB max)
1053    if let Err(e) = mount(
1054        Some("tmpfs"),
1055        &secrets_dir,
1056        Some("tmpfs"),
1057        MsFlags::MS_NOSUID | MsFlags::MS_NODEV | MsFlags::MS_NOEXEC,
1058        Some("size=16m,mode=0700"),
1059    ) {
1060        let _ = std::fs::remove_dir_all(&secrets_dir);
1061        return Err(NucleusError::FilesystemError(format!(
1062            "Failed to mount secrets tmpfs at {:?}: {}",
1063            secrets_dir, e
1064        )));
1065    }
1066
1067    if !identity.is_root() {
1068        nix::unistd::chown(
1069            &secrets_dir,
1070            Some(nix::unistd::Uid::from_raw(identity.uid)),
1071            Some(nix::unistd::Gid::from_raw(identity.gid)),
1072        )
1073        .map_err(|e| {
1074            let _ = nix::mount::umount2(&secrets_dir, nix::mount::MntFlags::MNT_DETACH);
1075            let _ = std::fs::remove_dir_all(&secrets_dir);
1076            NucleusError::FilesystemError(format!(
1077                "Failed to set /run/secrets owner to {}:{}: {}",
1078                identity.uid, identity.gid, e
1079            ))
1080        })?;
1081    }
1082
1083    // Rollback: unmount tmpfs and remove dir if any secret fails
1084    let result = mount_secrets_inmemory_inner(&secrets_dir, root, secrets, identity);
1085    if let Err(ref e) = result {
1086        let _ = nix::mount::umount2(&secrets_dir, nix::mount::MntFlags::MNT_DETACH);
1087        let _ = std::fs::remove_dir_all(&secrets_dir);
1088        return Err(NucleusError::FilesystemError(format!(
1089            "Secret mount failed (rolled back): {}",
1090            e
1091        )));
1092    }
1093
1094    info!("All secrets mounted on in-memory tmpfs");
1095    Ok(())
1096}
1097
1098fn mount_secrets_inmemory_inner(
1099    secrets_dir: &Path,
1100    root: &Path,
1101    secrets: &[crate::container::SecretMount],
1102    identity: &crate::container::ProcessIdentity,
1103) -> Result<()> {
1104    for secret in secrets {
1105        // Use symlink_metadata (lstat) to check existence without following
1106        // symlinks, preventing TOCTOU via symlink swap before the read.
1107        if std::fs::symlink_metadata(&secret.source).is_err() {
1108            return Err(NucleusError::FilesystemError(format!(
1109                "Secret source does not exist: {:?}",
1110                secret.source
1111            )));
1112        }
1113
1114        // Read secret content from host
1115        let mut content = std::fs::read(&secret.source).map_err(|e| {
1116            NucleusError::FilesystemError(format!(
1117                "Failed to read secret {:?}: {}",
1118                secret.source, e
1119            ))
1120        })?;
1121
1122        // Determine destination path inside the secrets tmpfs
1123        let dest = resolve_container_destination(secrets_dir, &secret.dest)?;
1124
1125        // Create parent directories within the tmpfs
1126        if let Some(parent) = dest.parent() {
1127            std::fs::create_dir_all(parent).map_err(|e| {
1128                NucleusError::FilesystemError(format!(
1129                    "Failed to create secret parent dir {:?}: {}",
1130                    parent, e
1131                ))
1132            })?;
1133        }
1134
1135        // Write secret content to tmpfs
1136        std::fs::write(&dest, &content).map_err(|e| {
1137            NucleusError::FilesystemError(format!("Failed to write secret to {:?}: {}", dest, e))
1138        })?;
1139
1140        // Set permissions
1141        {
1142            use std::os::unix::fs::PermissionsExt;
1143            let perms = std::fs::Permissions::from_mode(secret.mode);
1144            std::fs::set_permissions(&dest, perms).map_err(|e| {
1145                NucleusError::FilesystemError(format!(
1146                    "Failed to set permissions on secret {:?}: {}",
1147                    dest, e
1148                ))
1149            })?;
1150        }
1151
1152        if !identity.is_root() {
1153            nix::unistd::chown(
1154                &dest,
1155                Some(nix::unistd::Uid::from_raw(identity.uid)),
1156                Some(nix::unistd::Gid::from_raw(identity.gid)),
1157            )
1158            .map_err(|e| {
1159                NucleusError::FilesystemError(format!(
1160                    "Failed to set permissions owner on secret {:?} to {}:{}: {}",
1161                    dest, identity.uid, identity.gid, e
1162                ))
1163            })?;
1164        }
1165
1166        // Zero the in-memory buffer
1167        zeroize::Zeroize::zeroize(&mut content);
1168        drop(content);
1169
1170        // Also bind-mount the secret to its expected container path for compatibility
1171        let container_dest = resolve_container_destination(root, &secret.dest)?;
1172        if container_dest != dest {
1173            if let Some(parent) = container_dest.parent() {
1174                std::fs::create_dir_all(parent).map_err(|e| {
1175                    NucleusError::FilesystemError(format!(
1176                        "Failed to create secret mount parent {:?}: {}",
1177                        parent, e
1178                    ))
1179                })?;
1180            }
1181
1182            if secret.source.is_file() {
1183                std::fs::write(&container_dest, "").map_err(|e| {
1184                    NucleusError::FilesystemError(format!(
1185                        "Failed to create secret mount point {:?}: {}",
1186                        container_dest, e
1187                    ))
1188                })?;
1189            }
1190
1191            mount(
1192                Some(dest.as_path()),
1193                &container_dest,
1194                None::<&str>,
1195                MsFlags::MS_BIND,
1196                None::<&str>,
1197            )
1198            .map_err(|e| {
1199                NucleusError::FilesystemError(format!(
1200                    "Failed to bind mount secret {:?} -> {:?}: {}",
1201                    dest, container_dest, e
1202                ))
1203            })?;
1204
1205            mount(
1206                None::<&str>,
1207                &container_dest,
1208                None::<&str>,
1209                MsFlags::MS_REMOUNT
1210                    | MsFlags::MS_BIND
1211                    | MsFlags::MS_RDONLY
1212                    | MsFlags::MS_NOSUID
1213                    | MsFlags::MS_NODEV
1214                    | MsFlags::MS_NOEXEC,
1215                None::<&str>,
1216            )
1217            .map_err(|e| {
1218                NucleusError::FilesystemError(format!(
1219                    "Failed to remount secret {:?} read-only: {}",
1220                    container_dest, e
1221                ))
1222            })?;
1223        }
1224
1225        debug!(
1226            "Secret {:?} -> {:?} (in-memory tmpfs, mode {:04o})",
1227            secret.source, secret.dest, secret.mode
1228        );
1229    }
1230
1231    Ok(())
1232}
1233
#[cfg(test)]
mod tests {
    use super::*;

    /// `sysrq-trigger` must appear in the null-masked /proc list.
    #[test]
    fn test_proc_mask_includes_sysrq_trigger() {
        let masked = PROC_NULL_MASKED.contains(&"sysrq-trigger");
        assert!(
            masked,
            "/proc/sysrq-trigger must be masked to prevent host DoS"
        );
    }

    /// `timer_stats` must appear in the null-masked /proc list.
    #[test]
    fn test_proc_mask_includes_timer_stats() {
        let masked = PROC_NULL_MASKED.contains(&"timer_stats");
        assert!(
            masked,
            "/proc/timer_stats must be masked to prevent kernel info leakage"
        );
    }

    /// All three kpage* entries must appear in the null-masked /proc list.
    #[test]
    fn test_proc_mask_includes_kpage_files() {
        ["kpagecount", "kpageflags", "kpagecgroup"]
            .iter()
            .for_each(|path| {
                assert!(
                    PROC_NULL_MASKED.contains(path),
                    "/proc/{} must be masked to prevent host memory layout leakage",
                    path
                );
            });
    }

    /// Both masked lists must cover the entries checked here.
    #[test]
    fn test_proc_mask_includes_oci_standard_paths() {
        // OCI runtime spec required masked paths
        let null_masked = ["kallsyms", "kcore", "sched_debug", "keys", "config.gz"];
        let tmpfs_masked = ["acpi", "bus", "scsi", "sys"];

        null_masked.iter().for_each(|path| {
            assert!(
                PROC_NULL_MASKED.contains(path),
                "/proc/{} must be in null-masked list (OCI spec)",
                path
            );
        });
        tmpfs_masked.iter().for_each(|path| {
            assert!(
                PROC_TMPFS_MASKED.contains(path),
                "/proc/{} must be in tmpfs-masked list (OCI spec)",
                path
            );
        });
    }
}