Skip to main content

nucleus/filesystem/
mount.rs

1use crate::error::{NucleusError, Result};
2use nix::mount::{mount, MsFlags};
3use nix::sys::stat::{makedev, mknod, Mode, SFlag};
4use nix::unistd::chroot;
5use std::path::{Component, Path, PathBuf};
6use tracing::{debug, info, warn};
7
/// Expected mount flags for audit verification.
///
/// One entry names a single mount point that `audit_mounts` looks up in
/// `/proc/self/mounts`, together with the options that mount must carry.
struct ExpectedMount {
    /// Mount point, compared against the second field of a
    /// `/proc/self/mounts` line.
    path: &'static str,
    /// Option names (e.g. `"ro"`, `"nosuid"`) that must each appear in the
    /// comma-separated options field of the matching mount entry.
    required_flags: &'static [&'static str],
    /// If true, the mount *must* exist in production mode. A missing critical
    /// mount (e.g. /proc) is treated as a violation rather than silently skipped.
    critical: bool,
}
16
17/// Known mount paths and the flags they must carry in production mode.
18const PRODUCTION_MOUNT_EXPECTATIONS: &[ExpectedMount] = &[
19    ExpectedMount {
20        path: "/bin",
21        required_flags: &["ro", "nosuid", "nodev"],
22        critical: true,
23    },
24    ExpectedMount {
25        path: "/usr",
26        required_flags: &["ro", "nosuid", "nodev"],
27        critical: true,
28    },
29    ExpectedMount {
30        path: "/lib",
31        required_flags: &["ro", "nosuid", "nodev"],
32        critical: false, // not all rootfs layouts have /lib
33    },
34    ExpectedMount {
35        path: "/lib64",
36        required_flags: &["ro", "nosuid", "nodev"],
37        critical: false, // not all rootfs layouts have /lib64
38    },
39    ExpectedMount {
40        path: "/etc",
41        required_flags: &["ro", "nosuid", "nodev"],
42        critical: true,
43    },
44    ExpectedMount {
45        path: "/nix",
46        required_flags: &["ro", "nosuid", "nodev"],
47        critical: false, // only present on NixOS-based rootfs
48    },
49    ExpectedMount {
50        path: "/sbin",
51        required_flags: &["ro", "nosuid", "nodev"],
52        critical: false, // not all rootfs layouts have /sbin
53    },
54    ExpectedMount {
55        path: "/proc",
56        required_flags: &["nosuid", "nodev", "noexec"],
57        critical: true,
58    },
59    ExpectedMount {
60        path: "/run/secrets",
61        required_flags: &["nosuid", "nodev", "noexec"],
62        critical: false, // only present when secrets are configured
63    },
64];
65
66/// Normalize an absolute container destination path and reject traversal.
67///
68/// Returns a normalized absolute path containing only `RootDir` and `Normal`
69/// components. `.` segments are ignored; `..` and relative paths are rejected.
70pub fn normalize_container_destination(dest: &Path) -> Result<PathBuf> {
71    if !dest.is_absolute() {
72        return Err(NucleusError::ConfigError(format!(
73            "Container destination must be absolute: {:?}",
74            dest
75        )));
76    }
77
78    let mut normalized = PathBuf::from("/");
79    let mut saw_component = false;
80
81    for component in dest.components() {
82        match component {
83            Component::RootDir => {}
84            Component::CurDir => {}
85            Component::Normal(part) => {
86                normalized.push(part);
87                saw_component = true;
88            }
89            Component::ParentDir => {
90                return Err(NucleusError::ConfigError(format!(
91                    "Container destination must not contain parent traversal: {:?}",
92                    dest
93                )));
94            }
95            Component::Prefix(_) => {
96                return Err(NucleusError::ConfigError(format!(
97                    "Unsupported container destination prefix: {:?}",
98                    dest
99                )));
100            }
101        }
102    }
103
104    if !saw_component {
105        return Err(NucleusError::ConfigError(format!(
106            "Container destination must not be the root directory: {:?}",
107            dest
108        )));
109    }
110
111    Ok(normalized)
112}
113
114/// Resolve a validated container destination under a host-side root directory.
115pub fn resolve_container_destination(root: &Path, dest: &Path) -> Result<PathBuf> {
116    let normalized = normalize_container_destination(dest)?;
117    let relative = normalized
118        .strip_prefix("/")
119        .expect("normalized container destination is always absolute");
120    Ok(root.join(relative))
121}
122
123/// Audit all mounts in the container's mount namespace.
124///
125/// Reads /proc/self/mounts and verifies that each known mount point carries
126/// its expected flags. In production mode, any missing flag is fatal.
127/// Returns Ok(()) if all checks pass, or a list of violations.
128pub fn audit_mounts(production_mode: bool) -> Result<()> {
129    let mounts_content = std::fs::read_to_string("/proc/self/mounts").map_err(|e| {
130        NucleusError::FilesystemError(format!("Failed to read /proc/self/mounts: {}", e))
131    })?;
132
133    let mut violations = Vec::new();
134
135    for expectation in PRODUCTION_MOUNT_EXPECTATIONS {
136        // Find the mount entry for this path
137        let mount_entry = mounts_content.lines().find(|line| {
138            let parts: Vec<&str> = line.split_whitespace().collect();
139            parts.len() >= 4 && parts[1] == expectation.path
140        });
141
142        if let Some(entry) = mount_entry {
143            let parts: Vec<&str> = entry.split_whitespace().collect();
144            if parts.len() >= 4 {
145                let options = parts[3];
146                for &flag in expectation.required_flags {
147                    if !options.split(',').any(|opt| opt == flag) {
148                        violations.push(format!(
149                            "Mount {} missing required flag '{}' (has: {})",
150                            expectation.path, flag, options
151                        ));
152                    }
153                }
154            }
155        } else if expectation.critical && production_mode {
156            violations.push(format!(
157                "Critical mount {} is missing from the mount namespace",
158                expectation.path
159            ));
160        }
161    }
162
163    if violations.is_empty() {
164        info!("Mount audit passed: all expected flags verified");
165        Ok(())
166    } else if production_mode {
167        Err(NucleusError::FilesystemError(format!(
168            "Mount audit failed in production mode:\n  {}",
169            violations.join("\n  ")
170        )))
171    } else {
172        for v in &violations {
173            warn!("Mount audit: {}", v);
174        }
175        Ok(())
176    }
177}
178
179/// Create minimal filesystem structure in the new root
180pub fn create_minimal_fs(root: &Path) -> Result<()> {
181    info!("Creating minimal filesystem structure at {:?}", root);
182
183    // Create essential directories
184    let dirs = vec![
185        "dev",
186        "proc",
187        "sys",
188        "tmp",
189        "bin",
190        "sbin",
191        "usr",
192        "lib",
193        "lib64",
194        "etc",
195        "nix",
196        "nix/store",
197        "run",
198        "context",
199    ];
200
201    for dir in dirs {
202        let path = root.join(dir);
203        std::fs::create_dir_all(&path).map_err(|e| {
204            NucleusError::FilesystemError(format!("Failed to create directory {:?}: {}", path, e))
205        })?;
206    }
207
208    info!("Created minimal filesystem structure");
209
210    Ok(())
211}
212
213/// Create essential device nodes in /dev
214///
215/// In rootless mode, device node creation will fail gracefully
216pub fn create_dev_nodes(dev_path: &Path, include_tty: bool) -> Result<()> {
217    info!("Creating device nodes at {:?}", dev_path);
218
219    // Device nodes: (name, type, major, minor)
220    let mut devices = vec![
221        ("null", SFlag::S_IFCHR, 1, 3),
222        ("zero", SFlag::S_IFCHR, 1, 5),
223        ("full", SFlag::S_IFCHR, 1, 7),
224        ("random", SFlag::S_IFCHR, 1, 8),
225        ("urandom", SFlag::S_IFCHR, 1, 9),
226    ];
227    if include_tty {
228        devices.push(("tty", SFlag::S_IFCHR, 5, 0));
229    }
230
231    let mut created_count = 0;
232    let mut failed_count = 0;
233
234    for (name, dev_type, major, minor) in devices {
235        let path = dev_path.join(name);
236        let mode = Mode::from_bits_truncate(0o666);
237        let dev = makedev(major, minor);
238
239        match mknod(&path, dev_type, mode, dev) {
240            Ok(_) => {
241                info!("Created device node: {:?}", path);
242                created_count += 1;
243            }
244            Err(e) => {
245                // In rootless mode, mknod fails - this is expected
246                warn!(
247                    "Failed to create device node {:?}: {} (this is normal in rootless mode)",
248                    path, e
249                );
250                failed_count += 1;
251            }
252        }
253    }
254
255    if created_count > 0 {
256        info!("Successfully created {} device nodes", created_count);
257    }
258    if failed_count > 0 {
259        info!("Skipped {} device nodes (rootless mode)", failed_count);
260    }
261
262    Ok(())
263}
264
265/// Bind mount a pre-built rootfs (e.g. a Nix store closure) into the container.
266///
267/// Instead of exposing the full host /bin, /usr, /lib, /lib64, /nix, this mounts
268/// a minimal, purpose-built root filesystem. Suitable for production services.
269pub fn bind_mount_rootfs(root: &Path, rootfs_path: &Path) -> Result<()> {
270    info!(
271        "Bind mounting production rootfs {:?} into container {:?}",
272        rootfs_path, root
273    );
274
275    if !rootfs_path.exists() {
276        return Err(NucleusError::FilesystemError(format!(
277            "Rootfs path does not exist: {:?}",
278            rootfs_path
279        )));
280    }
281
282    // Bind mount the rootfs contents into the container root.
283    // The rootfs is expected to contain a standard FHS layout (/bin, /lib, /etc, etc.)
284    // produced by a Nix buildEnv or similar.
285    let subdirs = ["bin", "sbin", "lib", "lib64", "usr", "etc", "nix"];
286
287    for subdir in &subdirs {
288        let source = rootfs_path.join(subdir);
289        if !source.exists() {
290            debug!("Rootfs subdir {} not present, skipping", subdir);
291            continue;
292        }
293
294        let target = root.join(subdir);
295        std::fs::create_dir_all(&target).map_err(|e| {
296            NucleusError::FilesystemError(format!(
297                "Failed to create mount point {:?}: {}",
298                target, e
299            ))
300        })?;
301
302        mount(
303            Some(&source),
304            &target,
305            None::<&str>,
306            MsFlags::MS_BIND | MsFlags::MS_REC,
307            None::<&str>,
308        )
309        .map_err(|e| {
310            NucleusError::FilesystemError(format!(
311                "Failed to bind mount rootfs {:?} -> {:?}: {}",
312                source, target, e
313            ))
314        })?;
315
316        // Remount read-only
317        mount(
318            None::<&str>,
319            &target,
320            None::<&str>,
321            MsFlags::MS_REMOUNT
322                | MsFlags::MS_BIND
323                | MsFlags::MS_RDONLY
324                | MsFlags::MS_REC
325                | MsFlags::MS_NOSUID
326                | MsFlags::MS_NODEV,
327            None::<&str>,
328        )
329        .map_err(|e| {
330            NucleusError::FilesystemError(format!(
331                "Failed to remount rootfs {:?} read-only: {}",
332                target, e
333            ))
334        })?;
335
336        info!("Mounted rootfs/{} read-only", subdir);
337    }
338
339    Ok(())
340}
341
342/// Bind mount essential host directories into container
343///
344/// This allows host binaries to be accessible inside the container.
345/// Used in agent mode. Production mode should use bind_mount_rootfs() instead.
346pub fn bind_mount_host_paths(root: &Path, best_effort: bool) -> Result<()> {
347    info!("Bind mounting host paths into container");
348
349    // Essential paths to bind mount (read-only)
350    let host_paths = vec![
351        "/bin", "/usr", "/lib", "/lib64", "/nix", // For NixOS
352    ];
353
354    for host_path in host_paths {
355        let host = Path::new(host_path);
356
357        // Only mount if the path exists on the host
358        if !host.exists() {
359            debug!("Skipping {} (not present on host)", host_path);
360            continue;
361        }
362
363        let container_path = root.join(host_path.trim_start_matches('/'));
364
365        // Create mount point
366        if let Err(e) = std::fs::create_dir_all(&container_path) {
367            if best_effort {
368                warn!("Failed to create mount point {:?}: {}", container_path, e);
369                continue;
370            }
371            return Err(NucleusError::FilesystemError(format!(
372                "Failed to create mount point {:?}: {}",
373                container_path, e
374            )));
375        }
376
377        // Attempt bind mount
378        // Note: Linux ignores MS_RDONLY on the initial bind mount call.
379        // A second remount is required to actually enforce read-only.
380        match mount(
381            Some(host),
382            &container_path,
383            None::<&str>,
384            MsFlags::MS_BIND | MsFlags::MS_REC,
385            None::<&str>,
386        ) {
387            Ok(_) => {
388                // Remount as read-only – required because MS_RDONLY is ignored on initial bind
389                mount(
390                    None::<&str>,
391                    &container_path,
392                    None::<&str>,
393                    MsFlags::MS_REMOUNT
394                        | MsFlags::MS_BIND
395                        | MsFlags::MS_RDONLY
396                        | MsFlags::MS_REC
397                        | MsFlags::MS_NOSUID
398                        | MsFlags::MS_NODEV,
399                    None::<&str>,
400                )
401                .map_err(|e| {
402                    NucleusError::FilesystemError(format!(
403                        "Failed to remount {} as read-only: {}",
404                        host_path, e
405                    ))
406                })?;
407                info!(
408                    "Bind mounted {} to {:?} (read-only)",
409                    host_path, container_path
410                );
411            }
412            Err(e) => {
413                if best_effort {
414                    warn!(
415                        "Failed to bind mount {}: {} (continuing anyway)",
416                        host_path, e
417                    );
418                } else {
419                    return Err(NucleusError::FilesystemError(format!(
420                        "Failed to bind mount {}: {}",
421                        host_path, e
422                    )));
423                }
424            }
425        }
426    }
427
428    Ok(())
429}
430
431/// Mount procfs at the given path
432///
433/// In rootless mode, procfs mounting should work due to user namespace capabilities.
434/// When `hide_pids` is true, mounts with hidepid=2 so processes cannot enumerate
435/// other PIDs (production hardening).
436pub fn mount_procfs(
437    proc_path: &Path,
438    best_effort: bool,
439    read_only: bool,
440    hide_pids: bool,
441) -> Result<()> {
442    info!(
443        "Mounting procfs at {:?} (hidepid={})",
444        proc_path,
445        if hide_pids { "2" } else { "0" }
446    );
447
448    let mount_data: Option<&str> = if hide_pids { Some("hidepid=2") } else { None };
449
450    match mount(
451        Some("proc"),
452        proc_path,
453        Some("proc"),
454        MsFlags::MS_NOSUID | MsFlags::MS_NODEV | MsFlags::MS_NOEXEC,
455        mount_data,
456    ) {
457        Ok(_) => {
458            if read_only {
459                mount(
460                    None::<&str>,
461                    proc_path,
462                    None::<&str>,
463                    MsFlags::MS_REMOUNT
464                        | MsFlags::MS_RDONLY
465                        | MsFlags::MS_NOSUID
466                        | MsFlags::MS_NODEV
467                        | MsFlags::MS_NOEXEC,
468                    None::<&str>,
469                )
470                .map_err(|e| {
471                    NucleusError::FilesystemError(format!(
472                        "Failed to remount procfs read-only: {}",
473                        e
474                    ))
475                })?;
476                info!("Successfully mounted procfs (read-only)");
477            } else {
478                info!("Successfully mounted procfs");
479            }
480            Ok(())
481        }
482        Err(e) => {
483            if best_effort {
484                warn!("Failed to mount procfs: {} (continuing anyway)", e);
485                Ok(())
486            } else {
487                Err(NucleusError::FilesystemError(format!(
488                    "Failed to mount procfs: {}",
489                    e
490                )))
491            }
492        }
493    }
494}
495
/// Paths to mask with /dev/null (files) — matches OCI runtime spec masked paths.
/// Exposed for testing; the canonical list of sensitive /proc entries that must
/// be hidden from container processes.
///
/// Each name is relative to the procfs mount point. `mask_proc_paths`
/// bind-mounts `/dev/null` over every entry that exists and skips the rest.
pub const PROC_NULL_MASKED: &[&str] = &[
    "kallsyms",
    "kcore",
    "sched_debug",
    "timer_list",
    "timer_stats",
    "keys",
    "latency_stats",
    "config.gz",
    "sysrq-trigger",
    "kpagecount",
    "kpageflags",
    "kpagecgroup",
];
513
/// Paths to mask with empty tmpfs (directories).
///
/// Each name is relative to the procfs mount point. `mask_proc_paths` mounts
/// a read-only tmpfs over every entry that exists and skips the rest.
pub const PROC_TMPFS_MASKED: &[&str] = &["acpi", "bus", "irq", "scsi", "sys"];
516
517/// Mask sensitive /proc paths by bind-mounting /dev/null or tmpfs over them
518///
519/// This reduces kernel information leakage from the container. Follows OCI runtime
520/// conventions for masked paths.
521///
522/// SEC-06: When `production` is true, failures to mask critical paths
523/// (kcore, kallsyms, sysrq-trigger) are fatal instead of warn-and-continue.
524pub fn mask_proc_paths(proc_path: &Path, production: bool) -> Result<()> {
525    info!("Masking sensitive /proc paths");
526
527    const CRITICAL_PROC_PATHS: &[&str] = &["kcore", "kallsyms", "sysrq-trigger"];
528
529    let dev_null = Path::new("/dev/null");
530
531    for name in PROC_NULL_MASKED {
532        let target = proc_path.join(name);
533        if !target.exists() {
534            continue;
535        }
536        match mount(
537            Some(dev_null),
538            &target,
539            None::<&str>,
540            MsFlags::MS_BIND,
541            None::<&str>,
542        ) {
543            Ok(_) => {
544                // Remount read-only: Linux ignores MS_RDONLY on the initial bind mount,
545                // so a separate MS_REMOUNT|MS_BIND|MS_RDONLY call is required.
546                if let Err(e) = mount(
547                    None::<&str>,
548                    &target,
549                    None::<&str>,
550                    MsFlags::MS_REMOUNT | MsFlags::MS_BIND | MsFlags::MS_RDONLY,
551                    None::<&str>,
552                ) {
553                    if production && CRITICAL_PROC_PATHS.contains(name) {
554                        return Err(NucleusError::FilesystemError(format!(
555                            "Failed to remount /proc/{} read-only in production mode: {}",
556                            name, e
557                        )));
558                    }
559                    warn!(
560                        "Failed to remount /proc/{} read-only: {} (continuing)",
561                        name, e
562                    );
563                }
564                debug!("Masked /proc/{} (read-only)", name);
565            }
566            Err(e) => {
567                if production && CRITICAL_PROC_PATHS.contains(name) {
568                    return Err(NucleusError::FilesystemError(format!(
569                        "Failed to mask critical /proc/{} in production mode: {}",
570                        name, e
571                    )));
572                }
573                warn!("Failed to mask /proc/{}: {} (continuing)", name, e);
574            }
575        }
576    }
577
578    for name in PROC_TMPFS_MASKED {
579        let target = proc_path.join(name);
580        if !target.exists() {
581            continue;
582        }
583        match mount(
584            Some("tmpfs"),
585            &target,
586            Some("tmpfs"),
587            MsFlags::MS_RDONLY | MsFlags::MS_NOSUID | MsFlags::MS_NODEV | MsFlags::MS_NOEXEC,
588            Some("size=0"),
589        ) {
590            Ok(_) => debug!("Masked /proc/{}", name),
591            Err(e) => {
592                if production {
593                    return Err(NucleusError::FilesystemError(format!(
594                        "Failed to mask /proc/{} in production mode: {}",
595                        name, e
596                    )));
597                }
598                warn!("Failed to mask /proc/{}: {} (continuing)", name, e);
599            }
600        }
601    }
602
603    info!("Finished masking sensitive /proc paths");
604    Ok(())
605}
606
607/// Switch to new root filesystem using pivot_root or chroot
608///
609/// This implements the transition: populated -> pivoted
610/// Fails closed if root switching cannot be established.
611pub fn switch_root(new_root: &Path, allow_chroot_fallback: bool) -> Result<()> {
612    info!("Switching root to {:?}", new_root);
613
614    match pivot_root_impl(new_root) {
615        Ok(()) => {
616            info!("Successfully switched root using pivot_root");
617            Ok(())
618        }
619        Err(e) => {
620            if allow_chroot_fallback {
621                warn!(
622                    "pivot_root failed ({}), falling back to chroot due to explicit \
623                     configuration",
624                    e
625                );
626                chroot_impl(new_root)
627            } else {
628                Err(NucleusError::PivotRootError(format!(
629                    "pivot_root failed: {}. chroot fallback is disabled by default; use \
630                     --allow-chroot-fallback to allow weaker isolation",
631                    e
632                )))
633            }
634        }
635    }
636}
637
638/// Implement root switch using pivot_root(2)
639///
640/// pivot_root is preferred over chroot because:
641/// - More secure (old root can be unmounted)
642/// - Works better with mount namespaces
643fn pivot_root_impl(new_root: &Path) -> Result<()> {
644    use nix::unistd::pivot_root;
645
646    // pivot_root requires new_root to be a mount point
647    // and old_root to be under new_root
648
649    let old_root = new_root.join(".old_root");
650    std::fs::create_dir_all(&old_root).map_err(|e| {
651        NucleusError::PivotRootError(format!("Failed to create old_root directory: {}", e))
652    })?;
653
654    // Perform pivot_root
655    pivot_root(new_root, &old_root)
656        .map_err(|e| NucleusError::PivotRootError(format!("pivot_root syscall failed: {}", e)))?;
657
658    // Change to new root
659    std::env::set_current_dir("/")
660        .map_err(|e| NucleusError::PivotRootError(format!("Failed to chdir to /: {}", e)))?;
661
662    // Unmount old root
663    nix::mount::umount2("/.old_root", nix::mount::MntFlags::MNT_DETACH)
664        .map_err(|e| NucleusError::PivotRootError(format!("Failed to unmount old root: {}", e)))?;
665
666    // Remove old root directory
667    let _ = std::fs::remove_dir("/.old_root");
668
669    Ok(())
670}
671
672/// Implement root switch using chroot(2)
673///
674/// chroot is less secure than pivot_root but works in more situations
675fn chroot_impl(new_root: &Path) -> Result<()> {
676    chroot(new_root)
677        .map_err(|e| NucleusError::PivotRootError(format!("chroot syscall failed: {}", e)))?;
678
679    // Change to new root
680    std::env::set_current_dir("/")
681        .map_err(|e| NucleusError::PivotRootError(format!("Failed to chdir to /: {}", e)))?;
682
683    info!("Successfully switched root using chroot");
684
685    Ok(())
686}
687
688/// Mount secret files into the container root.
689///
690/// Each secret is bind-mounted read-only from its source to the destination
691/// path inside the container. Intermediate directories are created as needed.
692pub fn mount_secrets(root: &Path, secrets: &[crate::container::SecretMount]) -> Result<()> {
693    if secrets.is_empty() {
694        return Ok(());
695    }
696
697    info!("Mounting {} secret(s) into container", secrets.len());
698
699    for secret in secrets {
700        if !secret.source.exists() {
701            return Err(NucleusError::FilesystemError(format!(
702                "Secret source does not exist: {:?}",
703                secret.source
704            )));
705        }
706
707        // Destination inside container root
708        let dest = resolve_container_destination(root, &secret.dest)?;
709
710        // Create parent directories
711        if let Some(parent) = dest.parent() {
712            std::fs::create_dir_all(parent).map_err(|e| {
713                NucleusError::FilesystemError(format!(
714                    "Failed to create secret mount parent {:?}: {}",
715                    parent, e
716                ))
717            })?;
718        }
719
720        // Create mount point file
721        if secret.source.is_file() {
722            std::fs::write(&dest, "").map_err(|e| {
723                NucleusError::FilesystemError(format!(
724                    "Failed to create secret mount point {:?}: {}",
725                    dest, e
726                ))
727            })?;
728        } else {
729            std::fs::create_dir_all(&dest).map_err(|e| {
730                NucleusError::FilesystemError(format!(
731                    "Failed to create secret mount dir {:?}: {}",
732                    dest, e
733                ))
734            })?;
735        }
736
737        // Bind mount read-only
738        mount(
739            Some(secret.source.as_path()),
740            &dest,
741            None::<&str>,
742            MsFlags::MS_BIND,
743            None::<&str>,
744        )
745        .map_err(|e| {
746            NucleusError::FilesystemError(format!(
747                "Failed to bind mount secret {:?}: {}",
748                secret.source, e
749            ))
750        })?;
751
752        mount(
753            None::<&str>,
754            &dest,
755            None::<&str>,
756            MsFlags::MS_REMOUNT
757                | MsFlags::MS_BIND
758                | MsFlags::MS_RDONLY
759                | MsFlags::MS_NOSUID
760                | MsFlags::MS_NODEV
761                | MsFlags::MS_NOEXEC,
762            None::<&str>,
763        )
764        .map_err(|e| {
765            NucleusError::FilesystemError(format!(
766                "Failed to remount secret {:?} read-only: {}",
767                dest, e
768            ))
769        })?;
770
771        // Apply configured file permissions on the mount point
772        if secret.source.is_file() {
773            use std::os::unix::fs::PermissionsExt;
774            let perms = std::fs::Permissions::from_mode(secret.mode);
775            if let Err(e) = std::fs::set_permissions(&dest, perms) {
776                warn!(
777                    "Failed to set mode {:04o} on secret {:?}: {} (bind mount may override)",
778                    secret.mode, dest, e
779                );
780            }
781        }
782
783        debug!(
784            "Mounted secret {:?} -> {:?} (mode {:04o})",
785            secret.source, secret.dest, secret.mode
786        );
787    }
788
789    Ok(())
790}
791
792/// Mount secrets onto a dedicated in-memory tmpfs instead of bind-mounting host paths.
793///
794/// Creates a per-container tmpfs at `<root>/run/secrets` with MS_NOEXEC | MS_NOSUID | MS_NODEV,
795/// copies secret contents into it, then zeros the read buffer. This ensures secrets
796/// never reference host-side files after setup and are never persisted to disk.
797pub fn mount_secrets_inmemory(
798    root: &Path,
799    secrets: &[crate::container::SecretMount],
800) -> Result<()> {
801    if secrets.is_empty() {
802        return Ok(());
803    }
804
805    info!("Mounting {} secret(s) on in-memory tmpfs", secrets.len());
806
807    let secrets_dir = root.join("run/secrets");
808    std::fs::create_dir_all(&secrets_dir).map_err(|e| {
809        NucleusError::FilesystemError(format!(
810            "Failed to create secrets dir {:?}: {}",
811            secrets_dir, e
812        ))
813    })?;
814
815    // Mount a size-limited tmpfs for secrets (16 MiB max)
816    if let Err(e) = mount(
817        Some("tmpfs"),
818        &secrets_dir,
819        Some("tmpfs"),
820        MsFlags::MS_NOSUID | MsFlags::MS_NODEV | MsFlags::MS_NOEXEC,
821        Some("size=16m,mode=0700"),
822    ) {
823        let _ = std::fs::remove_dir_all(&secrets_dir);
824        return Err(NucleusError::FilesystemError(format!(
825            "Failed to mount secrets tmpfs at {:?}: {}",
826            secrets_dir, e
827        )));
828    }
829
830    // Rollback: unmount tmpfs and remove dir if any secret fails
831    let result = mount_secrets_inmemory_inner(&secrets_dir, root, secrets);
832    if let Err(ref e) = result {
833        let _ = nix::mount::umount2(&secrets_dir, nix::mount::MntFlags::MNT_DETACH);
834        let _ = std::fs::remove_dir_all(&secrets_dir);
835        return Err(NucleusError::FilesystemError(format!(
836            "Secret mount failed (rolled back): {}",
837            e
838        )));
839    }
840
841    info!("All secrets mounted on in-memory tmpfs");
842    Ok(())
843}
844
845fn mount_secrets_inmemory_inner(
846    secrets_dir: &Path,
847    root: &Path,
848    secrets: &[crate::container::SecretMount],
849) -> Result<()> {
850    for secret in secrets {
851        if !secret.source.exists() {
852            return Err(NucleusError::FilesystemError(format!(
853                "Secret source does not exist: {:?}",
854                secret.source
855            )));
856        }
857
858        // Read secret content from host
859        let mut content = std::fs::read(&secret.source).map_err(|e| {
860            NucleusError::FilesystemError(format!(
861                "Failed to read secret {:?}: {}",
862                secret.source, e
863            ))
864        })?;
865
866        // Determine destination path inside the secrets tmpfs
867        let dest = resolve_container_destination(secrets_dir, &secret.dest)?;
868
869        // Create parent directories within the tmpfs
870        if let Some(parent) = dest.parent() {
871            std::fs::create_dir_all(parent).map_err(|e| {
872                NucleusError::FilesystemError(format!(
873                    "Failed to create secret parent dir {:?}: {}",
874                    parent, e
875                ))
876            })?;
877        }
878
879        // Write secret content to tmpfs
880        std::fs::write(&dest, &content).map_err(|e| {
881            NucleusError::FilesystemError(format!("Failed to write secret to {:?}: {}", dest, e))
882        })?;
883
884        // Set permissions
885        {
886            use std::os::unix::fs::PermissionsExt;
887            let perms = std::fs::Permissions::from_mode(secret.mode);
888            std::fs::set_permissions(&dest, perms).map_err(|e| {
889                NucleusError::FilesystemError(format!(
890                    "Failed to set permissions on secret {:?}: {}",
891                    dest, e
892                ))
893            })?;
894        }
895
896        // Zero the in-memory buffer
897        zeroize::Zeroize::zeroize(&mut content);
898        drop(content);
899
900        // Also bind-mount the secret to its expected container path for compatibility
901        let container_dest = resolve_container_destination(root, &secret.dest)?;
902        if container_dest != dest {
903            if let Some(parent) = container_dest.parent() {
904                std::fs::create_dir_all(parent).map_err(|e| {
905                    NucleusError::FilesystemError(format!(
906                        "Failed to create secret mount parent {:?}: {}",
907                        parent, e
908                    ))
909                })?;
910            }
911
912            if secret.source.is_file() {
913                std::fs::write(&container_dest, "").map_err(|e| {
914                    NucleusError::FilesystemError(format!(
915                        "Failed to create secret mount point {:?}: {}",
916                        container_dest, e
917                    ))
918                })?;
919            }
920
921            mount(
922                Some(dest.as_path()),
923                &container_dest,
924                None::<&str>,
925                MsFlags::MS_BIND,
926                None::<&str>,
927            )
928            .map_err(|e| {
929                NucleusError::FilesystemError(format!(
930                    "Failed to bind mount secret {:?} -> {:?}: {}",
931                    dest, container_dest, e
932                ))
933            })?;
934
935            mount(
936                None::<&str>,
937                &container_dest,
938                None::<&str>,
939                MsFlags::MS_REMOUNT
940                    | MsFlags::MS_BIND
941                    | MsFlags::MS_RDONLY
942                    | MsFlags::MS_NOSUID
943                    | MsFlags::MS_NODEV
944                    | MsFlags::MS_NOEXEC,
945                None::<&str>,
946            )
947            .map_err(|e| {
948                NucleusError::FilesystemError(format!(
949                    "Failed to remount secret {:?} read-only: {}",
950                    container_dest, e
951                ))
952            })?;
953        }
954
955        debug!(
956            "Secret {:?} -> {:?} (in-memory tmpfs, mode {:04o})",
957            secret.source, secret.dest, secret.mode
958        );
959    }
960
961    Ok(())
962}
963
#[cfg(test)]
mod tests {
    use super::*;

    /// sysrq-trigger must be covered by the /dev/null mask list.
    #[test]
    fn test_proc_mask_includes_sysrq_trigger() {
        let masked = PROC_NULL_MASKED.iter().any(|entry| *entry == "sysrq-trigger");
        assert!(
            masked,
            "/proc/sysrq-trigger must be masked to prevent host DoS"
        );
    }

    /// timer_stats must be covered by the /dev/null mask list.
    #[test]
    fn test_proc_mask_includes_timer_stats() {
        let masked = PROC_NULL_MASKED.iter().any(|entry| *entry == "timer_stats");
        assert!(
            masked,
            "/proc/timer_stats must be masked to prevent kernel info leakage"
        );
    }

    /// All three kpage* files must be covered by the /dev/null mask list.
    #[test]
    fn test_proc_mask_includes_kpage_files() {
        let kpage_files = ["kpagecount", "kpageflags", "kpagecgroup"];
        for path in kpage_files.iter() {
            assert!(
                PROC_NULL_MASKED.contains(path),
                "/proc/{} must be masked to prevent host memory layout leakage",
                path
            );
        }
    }

    /// The OCI-listed files and directories must appear in their respective lists.
    #[test]
    fn test_proc_mask_includes_oci_standard_paths() {
        // OCI runtime spec required masked paths
        let null_masked = ["kallsyms", "kcore", "sched_debug", "keys", "config.gz"];
        for path in null_masked.iter() {
            assert!(
                PROC_NULL_MASKED.contains(path),
                "/proc/{} must be in null-masked list (OCI spec)",
                path
            );
        }

        let tmpfs_masked = ["acpi", "bus", "scsi", "sys"];
        for path in tmpfs_masked.iter() {
            assert!(
                PROC_TMPFS_MASKED.contains(path),
                "/proc/{} must be in tmpfs-masked list (OCI spec)",
                path
            );
        }
    }
}
1013}