Skip to main content

nucleus/filesystem/
mount.rs

1use crate::error::{NucleusError, Result};
2use nix::mount::{mount, MsFlags};
3use nix::sys::stat::{makedev, mknod, Mode, SFlag};
4use nix::unistd::chroot;
5use std::path::{Component, Path, PathBuf};
6use tracing::{debug, info, warn};
7
/// One row of the mount audit table: a mount point together with the mount
/// options the audit requires on it.
struct ExpectedMount {
    /// Absolute mount point, compared against the mount-point field of each
    /// `/proc/self/mounts` line.
    path: &'static str,
    /// Mount options that must all appear in the entry's option list.
    required_flags: &'static [&'static str],
    /// Whether the mount is mandatory in production mode. When `true`, a
    /// missing mount (e.g. /proc) is reported as a violation instead of being
    /// silently skipped.
    critical: bool,
}
16
17/// Known mount paths and the flags they must carry in production mode.
18const PRODUCTION_MOUNT_EXPECTATIONS: &[ExpectedMount] = &[
19    ExpectedMount {
20        path: "/bin",
21        required_flags: &["ro", "nosuid", "nodev"],
22        critical: true,
23    },
24    ExpectedMount {
25        path: "/usr",
26        required_flags: &["ro", "nosuid", "nodev"],
27        critical: true,
28    },
29    ExpectedMount {
30        path: "/lib",
31        required_flags: &["ro", "nosuid", "nodev"],
32        critical: false, // not all rootfs layouts have /lib
33    },
34    ExpectedMount {
35        path: "/lib64",
36        required_flags: &["ro", "nosuid", "nodev"],
37        critical: false, // not all rootfs layouts have /lib64
38    },
39    ExpectedMount {
40        path: "/etc",
41        required_flags: &["ro", "nosuid", "nodev"],
42        critical: true,
43    },
44    ExpectedMount {
45        path: "/nix",
46        required_flags: &["ro", "nosuid", "nodev"],
47        critical: false, // only present on NixOS-based rootfs
48    },
49    ExpectedMount {
50        path: "/sbin",
51        required_flags: &["ro", "nosuid", "nodev"],
52        critical: false, // not all rootfs layouts have /sbin
53    },
54    ExpectedMount {
55        path: "/proc",
56        required_flags: &["nosuid", "nodev", "noexec"],
57        critical: true,
58    },
59    ExpectedMount {
60        path: "/run/secrets",
61        required_flags: &["nosuid", "nodev", "noexec"],
62        critical: false, // only present when secrets are configured
63    },
64];
65
66/// Normalize an absolute container destination path and reject traversal.
67///
68/// Returns a normalized absolute path containing only `RootDir` and `Normal`
69/// components. `.` segments are ignored; `..` and relative paths are rejected.
70pub fn normalize_container_destination(dest: &Path) -> Result<PathBuf> {
71    if !dest.is_absolute() {
72        return Err(NucleusError::ConfigError(format!(
73            "Container destination must be absolute: {:?}",
74            dest
75        )));
76    }
77
78    let mut normalized = PathBuf::from("/");
79    let mut saw_component = false;
80
81    for component in dest.components() {
82        match component {
83            Component::RootDir => {}
84            Component::CurDir => {}
85            Component::Normal(part) => {
86                normalized.push(part);
87                saw_component = true;
88            }
89            Component::ParentDir => {
90                return Err(NucleusError::ConfigError(format!(
91                    "Container destination must not contain parent traversal: {:?}",
92                    dest
93                )));
94            }
95            Component::Prefix(_) => {
96                return Err(NucleusError::ConfigError(format!(
97                    "Unsupported container destination prefix: {:?}",
98                    dest
99                )));
100            }
101        }
102    }
103
104    if !saw_component {
105        return Err(NucleusError::ConfigError(format!(
106            "Container destination must not be the root directory: {:?}",
107            dest
108        )));
109    }
110
111    Ok(normalized)
112}
113
114/// Resolve a validated container destination under a host-side root directory.
115pub fn resolve_container_destination(root: &Path, dest: &Path) -> Result<PathBuf> {
116    let normalized = normalize_container_destination(dest)?;
117    let relative = normalized
118        .strip_prefix("/")
119        .expect("normalized container destination is always absolute");
120    Ok(root.join(relative))
121}
122
123/// Audit all mounts in the container's mount namespace.
124///
125/// Reads /proc/self/mounts and verifies that each known mount point carries
126/// its expected flags. In production mode, any missing flag is fatal.
127/// Returns Ok(()) if all checks pass, or a list of violations.
128pub fn audit_mounts(production_mode: bool) -> Result<()> {
129    let mounts_content = std::fs::read_to_string("/proc/self/mounts").map_err(|e| {
130        NucleusError::FilesystemError(format!("Failed to read /proc/self/mounts: {}", e))
131    })?;
132
133    let mut violations = Vec::new();
134
135    for expectation in PRODUCTION_MOUNT_EXPECTATIONS {
136        // Find the mount entry for this path
137        let mount_entry = mounts_content.lines().find(|line| {
138            let parts: Vec<&str> = line.split_whitespace().collect();
139            parts.len() >= 4 && parts[1] == expectation.path
140        });
141
142        if let Some(entry) = mount_entry {
143            let parts: Vec<&str> = entry.split_whitespace().collect();
144            if parts.len() >= 4 {
145                let options = parts[3];
146                for &flag in expectation.required_flags {
147                    if !options.split(',').any(|opt| opt == flag) {
148                        violations.push(format!(
149                            "Mount {} missing required flag '{}' (has: {})",
150                            expectation.path, flag, options
151                        ));
152                    }
153                }
154            }
155        } else if expectation.critical && production_mode {
156            violations.push(format!(
157                "Critical mount {} is missing from the mount namespace",
158                expectation.path
159            ));
160        }
161    }
162
163    if violations.is_empty() {
164        info!("Mount audit passed: all expected flags verified");
165        Ok(())
166    } else if production_mode {
167        Err(NucleusError::FilesystemError(format!(
168            "Mount audit failed in production mode:\n  {}",
169            violations.join("\n  ")
170        )))
171    } else {
172        for v in &violations {
173            warn!("Mount audit: {}", v);
174        }
175        Ok(())
176    }
177}
178
179/// Create minimal filesystem structure in the new root
180pub fn create_minimal_fs(root: &Path) -> Result<()> {
181    info!("Creating minimal filesystem structure at {:?}", root);
182
183    // Create essential directories
184    let dirs = vec![
185        "dev",
186        "proc",
187        "sys",
188        "tmp",
189        "bin",
190        "sbin",
191        "usr",
192        "lib",
193        "lib64",
194        "etc",
195        "nix",
196        "nix/store",
197        "run",
198        "context",
199    ];
200
201    for dir in dirs {
202        let path = root.join(dir);
203        std::fs::create_dir_all(&path).map_err(|e| {
204            NucleusError::FilesystemError(format!("Failed to create directory {:?}: {}", path, e))
205        })?;
206    }
207
208    info!("Created minimal filesystem structure");
209
210    Ok(())
211}
212
213/// Create essential device nodes in /dev
214///
215/// In rootless mode, device node creation will fail gracefully
216pub fn create_dev_nodes(dev_path: &Path, include_tty: bool) -> Result<()> {
217    info!("Creating device nodes at {:?}", dev_path);
218
219    // Device nodes: (name, type, major, minor)
220    let mut devices = vec![
221        ("null", SFlag::S_IFCHR, 1, 3),
222        ("zero", SFlag::S_IFCHR, 1, 5),
223        ("full", SFlag::S_IFCHR, 1, 7),
224        ("random", SFlag::S_IFCHR, 1, 8),
225        ("urandom", SFlag::S_IFCHR, 1, 9),
226    ];
227    if include_tty {
228        devices.push(("tty", SFlag::S_IFCHR, 5, 0));
229    }
230
231    let mut created_count = 0;
232    let mut failed_count = 0;
233
234    for (name, dev_type, major, minor) in devices {
235        let path = dev_path.join(name);
236        let mode = Mode::from_bits_truncate(0o666);
237        let dev = makedev(major, minor);
238
239        match mknod(&path, dev_type, mode, dev) {
240            Ok(_) => {
241                info!("Created device node: {:?}", path);
242                created_count += 1;
243            }
244            Err(e) => {
245                // In rootless mode, mknod fails - this is expected
246                warn!(
247                    "Failed to create device node {:?}: {} (this is normal in rootless mode)",
248                    path, e
249                );
250                failed_count += 1;
251            }
252        }
253    }
254
255    if created_count > 0 {
256        info!("Successfully created {} device nodes", created_count);
257    }
258    if failed_count > 0 {
259        info!("Skipped {} device nodes (rootless mode)", failed_count);
260    }
261
262    Ok(())
263}
264
265/// Bind mount a pre-built rootfs (e.g. a Nix store closure) into the container.
266///
267/// Instead of exposing the full host /bin, /usr, /lib, /lib64, /nix, this mounts
268/// a minimal, purpose-built root filesystem. Suitable for production services.
269pub fn bind_mount_rootfs(root: &Path, rootfs_path: &Path) -> Result<()> {
270    info!(
271        "Bind mounting production rootfs {:?} into container {:?}",
272        rootfs_path, root
273    );
274
275    if !rootfs_path.exists() {
276        return Err(NucleusError::FilesystemError(format!(
277            "Rootfs path does not exist: {:?}",
278            rootfs_path
279        )));
280    }
281
282    // Bind mount the rootfs contents into the container root.
283    // The rootfs is expected to contain a standard FHS layout (/bin, /lib, /etc, etc.)
284    // produced by a Nix buildEnv or similar.
285    let subdirs = ["bin", "sbin", "lib", "lib64", "usr", "etc", "nix"];
286
287    for subdir in &subdirs {
288        let source = rootfs_path.join(subdir);
289        if !source.exists() {
290            debug!("Rootfs subdir {} not present, skipping", subdir);
291            continue;
292        }
293
294        let target = root.join(subdir);
295        std::fs::create_dir_all(&target).map_err(|e| {
296            NucleusError::FilesystemError(format!(
297                "Failed to create mount point {:?}: {}",
298                target, e
299            ))
300        })?;
301
302        mount(
303            Some(&source),
304            &target,
305            None::<&str>,
306            MsFlags::MS_BIND | MsFlags::MS_REC,
307            None::<&str>,
308        )
309        .map_err(|e| {
310            NucleusError::FilesystemError(format!(
311                "Failed to bind mount rootfs {:?} -> {:?}: {}",
312                source, target, e
313            ))
314        })?;
315
316        // Remount read-only
317        mount(
318            None::<&str>,
319            &target,
320            None::<&str>,
321            MsFlags::MS_REMOUNT
322                | MsFlags::MS_BIND
323                | MsFlags::MS_RDONLY
324                | MsFlags::MS_REC
325                | MsFlags::MS_NOSUID
326                | MsFlags::MS_NODEV,
327            None::<&str>,
328        )
329        .map_err(|e| {
330            NucleusError::FilesystemError(format!(
331                "Failed to remount rootfs {:?} read-only: {}",
332                target, e
333            ))
334        })?;
335
336        info!("Mounted rootfs/{} read-only", subdir);
337    }
338
339    Ok(())
340}
341
342/// Bind mount essential host directories into container
343///
344/// This allows host binaries to be accessible inside the container.
345/// Used in agent mode. Production mode should use bind_mount_rootfs() instead.
346pub fn bind_mount_host_paths(root: &Path, best_effort: bool) -> Result<()> {
347    info!("Bind mounting host paths into container");
348
349    // Essential paths to bind mount (read-only)
350    let host_paths = vec![
351        "/bin", "/usr", "/lib", "/lib64", "/nix", // For NixOS
352    ];
353
354    for host_path in host_paths {
355        let host = Path::new(host_path);
356
357        // Only mount if the path exists on the host
358        if !host.exists() {
359            debug!("Skipping {} (not present on host)", host_path);
360            continue;
361        }
362
363        let container_path = root.join(host_path.trim_start_matches('/'));
364
365        // Create mount point
366        if let Err(e) = std::fs::create_dir_all(&container_path) {
367            if best_effort {
368                warn!("Failed to create mount point {:?}: {}", container_path, e);
369                continue;
370            }
371            return Err(NucleusError::FilesystemError(format!(
372                "Failed to create mount point {:?}: {}",
373                container_path, e
374            )));
375        }
376
377        // Attempt bind mount
378        // Note: Linux ignores MS_RDONLY on the initial bind mount call.
379        // A second remount is required to actually enforce read-only.
380        match mount(
381            Some(host),
382            &container_path,
383            None::<&str>,
384            MsFlags::MS_BIND | MsFlags::MS_REC,
385            None::<&str>,
386        ) {
387            Ok(_) => {
388                // Remount as read-only – required because MS_RDONLY is ignored on initial bind
389                mount(
390                    None::<&str>,
391                    &container_path,
392                    None::<&str>,
393                    MsFlags::MS_REMOUNT
394                        | MsFlags::MS_BIND
395                        | MsFlags::MS_RDONLY
396                        | MsFlags::MS_REC
397                        | MsFlags::MS_NOSUID
398                        | MsFlags::MS_NODEV,
399                    None::<&str>,
400                )
401                .map_err(|e| {
402                    NucleusError::FilesystemError(format!(
403                        "Failed to remount {} as read-only: {}",
404                        host_path, e
405                    ))
406                })?;
407                info!(
408                    "Bind mounted {} to {:?} (read-only)",
409                    host_path, container_path
410                );
411            }
412            Err(e) => {
413                if best_effort {
414                    warn!(
415                        "Failed to bind mount {}: {} (continuing anyway)",
416                        host_path, e
417                    );
418                } else {
419                    return Err(NucleusError::FilesystemError(format!(
420                        "Failed to bind mount {}: {}",
421                        host_path, e
422                    )));
423                }
424            }
425        }
426    }
427
428    Ok(())
429}
430
431/// Mount persistent bind volumes and ephemeral tmpfs volumes into the container root.
432pub fn mount_volumes(root: &Path, volumes: &[crate::container::VolumeMount]) -> Result<()> {
433    use crate::container::VolumeSource;
434
435    if volumes.is_empty() {
436        return Ok(());
437    }
438
439    info!("Mounting {} volume(s) into container", volumes.len());
440
441    for volume in volumes {
442        let dest = resolve_container_destination(root, &volume.dest)?;
443
444        match &volume.source {
445            VolumeSource::Bind { source } => {
446                if !source.exists() {
447                    return Err(NucleusError::FilesystemError(format!(
448                        "Volume source does not exist: {:?}",
449                        source
450                    )));
451                }
452
453                if let Some(parent) = dest.parent() {
454                    std::fs::create_dir_all(parent).map_err(|e| {
455                        NucleusError::FilesystemError(format!(
456                            "Failed to create volume mount parent {:?}: {}",
457                            parent, e
458                        ))
459                    })?;
460                }
461
462                let recursive = source.is_dir();
463                if source.is_file() {
464                    std::fs::write(&dest, "").map_err(|e| {
465                        NucleusError::FilesystemError(format!(
466                            "Failed to create volume mount point {:?}: {}",
467                            dest, e
468                        ))
469                    })?;
470                } else {
471                    std::fs::create_dir_all(&dest).map_err(|e| {
472                        NucleusError::FilesystemError(format!(
473                            "Failed to create volume mount dir {:?}: {}",
474                            dest, e
475                        ))
476                    })?;
477                }
478
479                let initial_flags = if recursive {
480                    MsFlags::MS_BIND | MsFlags::MS_REC
481                } else {
482                    MsFlags::MS_BIND
483                };
484                mount(
485                    Some(source.as_path()),
486                    &dest,
487                    None::<&str>,
488                    initial_flags,
489                    None::<&str>,
490                )
491                .map_err(|e| {
492                    NucleusError::FilesystemError(format!(
493                        "Failed to bind mount volume {:?} -> {:?}: {}",
494                        source, dest, e
495                    ))
496                })?;
497
498                let mut remount_flags =
499                    MsFlags::MS_REMOUNT | MsFlags::MS_BIND | MsFlags::MS_NOSUID | MsFlags::MS_NODEV;
500                if recursive {
501                    remount_flags |= MsFlags::MS_REC;
502                }
503                if volume.read_only {
504                    remount_flags |= MsFlags::MS_RDONLY;
505                }
506
507                mount(
508                    None::<&str>,
509                    &dest,
510                    None::<&str>,
511                    remount_flags,
512                    None::<&str>,
513                )
514                .map_err(|e| {
515                    NucleusError::FilesystemError(format!(
516                        "Failed to remount volume {:?} with final flags: {}",
517                        dest, e
518                    ))
519                })?;
520
521                info!(
522                    "Mounted bind volume {:?} -> {:?} ({})",
523                    source,
524                    volume.dest,
525                    if volume.read_only { "ro" } else { "rw" }
526                );
527            }
528            VolumeSource::Tmpfs { size } => {
529                std::fs::create_dir_all(&dest).map_err(|e| {
530                    NucleusError::FilesystemError(format!(
531                        "Failed to create tmpfs mount dir {:?}: {}",
532                        dest, e
533                    ))
534                })?;
535
536                let mount_data = size
537                    .as_ref()
538                    .map(|value| format!("size={},mode=0755", value))
539                    .unwrap_or_else(|| "mode=0755".to_string());
540
541                let mut flags = MsFlags::MS_NOSUID | MsFlags::MS_NODEV;
542                if volume.read_only {
543                    flags |= MsFlags::MS_RDONLY;
544                }
545                mount(
546                    Some("tmpfs"),
547                    &dest,
548                    Some("tmpfs"),
549                    flags,
550                    Some(mount_data.as_str()),
551                )
552                .map_err(|e| {
553                    NucleusError::FilesystemError(format!(
554                        "Failed to mount tmpfs volume at {:?}: {}",
555                        dest, e
556                    ))
557                })?;
558
559                info!(
560                    "Mounted tmpfs volume at {:?}{}{}",
561                    volume.dest,
562                    size.as_ref()
563                        .map(|value| format!(" (size={})", value))
564                        .unwrap_or_default(),
565                    if volume.read_only { " (ro)" } else { "" }
566                );
567            }
568        }
569    }
570
571    Ok(())
572}
573
574/// Mount procfs at the given path
575///
576/// In rootless mode, procfs mounting should work due to user namespace capabilities.
577/// When `hide_pids` is true, mounts with hidepid=2 so processes cannot enumerate
578/// other PIDs (production hardening).
579pub fn mount_procfs(
580    proc_path: &Path,
581    best_effort: bool,
582    read_only: bool,
583    hide_pids: bool,
584) -> Result<()> {
585    info!(
586        "Mounting procfs at {:?} (hidepid={})",
587        proc_path,
588        if hide_pids { "2" } else { "0" }
589    );
590
591    let mount_data: Option<&str> = if hide_pids { Some("hidepid=2") } else { None };
592
593    match mount(
594        Some("proc"),
595        proc_path,
596        Some("proc"),
597        MsFlags::MS_NOSUID | MsFlags::MS_NODEV | MsFlags::MS_NOEXEC,
598        mount_data,
599    ) {
600        Ok(_) => {
601            if read_only {
602                mount(
603                    None::<&str>,
604                    proc_path,
605                    None::<&str>,
606                    MsFlags::MS_REMOUNT
607                        | MsFlags::MS_RDONLY
608                        | MsFlags::MS_NOSUID
609                        | MsFlags::MS_NODEV
610                        | MsFlags::MS_NOEXEC,
611                    None::<&str>,
612                )
613                .map_err(|e| {
614                    NucleusError::FilesystemError(format!(
615                        "Failed to remount procfs read-only: {}",
616                        e
617                    ))
618                })?;
619                info!("Successfully mounted procfs (read-only)");
620            } else {
621                info!("Successfully mounted procfs");
622            }
623            Ok(())
624        }
625        Err(e) => {
626            if best_effort {
627                warn!("Failed to mount procfs: {} (continuing anyway)", e);
628                Ok(())
629            } else {
630                Err(NucleusError::FilesystemError(format!(
631                    "Failed to mount procfs: {}",
632                    e
633                )))
634            }
635        }
636    }
637}
638
/// Files under /proc that are hidden by bind-mounting /dev/null over them.
///
/// Names are relative to the procfs mount point. Public so that tests can
/// check the canonical list of sensitive /proc entries that must be hidden
/// from container processes.
pub const PROC_NULL_MASKED: &[&str] = &[
    // Kernel symbols and memory image
    "kallsyms",
    "kcore",
    // Scheduler and timer internals
    "sched_debug",
    "timer_list",
    "timer_stats",
    // Keys, latency statistics, kernel build configuration
    "keys",
    "latency_stats",
    "config.gz",
    // SysRq trigger
    "sysrq-trigger",
    // Per-page kernel bookkeeping
    "kpagecount",
    "kpageflags",
    "kpagecgroup",
];
656
/// Directories under /proc that are hidden by mounting an empty tmpfs over
/// them. Names are relative to the procfs mount point.
pub const PROC_TMPFS_MASKED: &[&str] = &[
    "acpi",
    "bus",
    "irq",
    "scsi",
    "sys",
];
659
660/// Mask sensitive /proc paths by bind-mounting /dev/null or tmpfs over them
661///
662/// This reduces kernel information leakage from the container. Follows OCI runtime
663/// conventions for masked paths.
664///
665/// SEC-06: When `production` is true, failures to mask critical paths
666/// (kcore, kallsyms, sysrq-trigger) are fatal instead of warn-and-continue.
667pub fn mask_proc_paths(proc_path: &Path, production: bool) -> Result<()> {
668    info!("Masking sensitive /proc paths");
669
670    const CRITICAL_PROC_PATHS: &[&str] = &["kcore", "kallsyms", "sysrq-trigger"];
671
672    let dev_null = Path::new("/dev/null");
673
674    for name in PROC_NULL_MASKED {
675        let target = proc_path.join(name);
676        if !target.exists() {
677            continue;
678        }
679        match mount(
680            Some(dev_null),
681            &target,
682            None::<&str>,
683            MsFlags::MS_BIND,
684            None::<&str>,
685        ) {
686            Ok(_) => {
687                // Remount read-only: Linux ignores MS_RDONLY on the initial bind mount,
688                // so a separate MS_REMOUNT|MS_BIND|MS_RDONLY call is required.
689                if let Err(e) = mount(
690                    None::<&str>,
691                    &target,
692                    None::<&str>,
693                    MsFlags::MS_REMOUNT | MsFlags::MS_BIND | MsFlags::MS_RDONLY,
694                    None::<&str>,
695                ) {
696                    if production && CRITICAL_PROC_PATHS.contains(name) {
697                        return Err(NucleusError::FilesystemError(format!(
698                            "Failed to remount /proc/{} read-only in production mode: {}",
699                            name, e
700                        )));
701                    }
702                    warn!(
703                        "Failed to remount /proc/{} read-only: {} (continuing)",
704                        name, e
705                    );
706                }
707                debug!("Masked /proc/{} (read-only)", name);
708            }
709            Err(e) => {
710                if production && CRITICAL_PROC_PATHS.contains(name) {
711                    return Err(NucleusError::FilesystemError(format!(
712                        "Failed to mask critical /proc/{} in production mode: {}",
713                        name, e
714                    )));
715                }
716                warn!("Failed to mask /proc/{}: {} (continuing)", name, e);
717            }
718        }
719    }
720
721    for name in PROC_TMPFS_MASKED {
722        let target = proc_path.join(name);
723        if !target.exists() {
724            continue;
725        }
726        match mount(
727            Some("tmpfs"),
728            &target,
729            Some("tmpfs"),
730            MsFlags::MS_RDONLY | MsFlags::MS_NOSUID | MsFlags::MS_NODEV | MsFlags::MS_NOEXEC,
731            Some("size=0"),
732        ) {
733            Ok(_) => debug!("Masked /proc/{}", name),
734            Err(e) => {
735                if production {
736                    return Err(NucleusError::FilesystemError(format!(
737                        "Failed to mask /proc/{} in production mode: {}",
738                        name, e
739                    )));
740                }
741                warn!("Failed to mask /proc/{}: {} (continuing)", name, e);
742            }
743        }
744    }
745
746    info!("Finished masking sensitive /proc paths");
747    Ok(())
748}
749
750/// Switch to new root filesystem using pivot_root or chroot
751///
752/// This implements the transition: populated -> pivoted
753/// Fails closed if root switching cannot be established.
754pub fn switch_root(new_root: &Path, allow_chroot_fallback: bool) -> Result<()> {
755    info!("Switching root to {:?}", new_root);
756
757    match pivot_root_impl(new_root) {
758        Ok(()) => {
759            info!("Successfully switched root using pivot_root");
760            Ok(())
761        }
762        Err(e) => {
763            if allow_chroot_fallback {
764                warn!(
765                    "pivot_root failed ({}), falling back to chroot due to explicit \
766                     configuration",
767                    e
768                );
769                chroot_impl(new_root)
770            } else {
771                Err(NucleusError::PivotRootError(format!(
772                    "pivot_root failed: {}. chroot fallback is disabled by default; use \
773                     --allow-chroot-fallback to allow weaker isolation",
774                    e
775                )))
776            }
777        }
778    }
779}
780
781/// Implement root switch using pivot_root(2)
782///
783/// pivot_root is preferred over chroot because:
784/// - More secure (old root can be unmounted)
785/// - Works better with mount namespaces
786fn pivot_root_impl(new_root: &Path) -> Result<()> {
787    use nix::unistd::pivot_root;
788
789    // pivot_root requires new_root to be a mount point
790    // and old_root to be under new_root
791
792    let old_root = new_root.join(".old_root");
793    std::fs::create_dir_all(&old_root).map_err(|e| {
794        NucleusError::PivotRootError(format!("Failed to create old_root directory: {}", e))
795    })?;
796
797    // Perform pivot_root
798    pivot_root(new_root, &old_root)
799        .map_err(|e| NucleusError::PivotRootError(format!("pivot_root syscall failed: {}", e)))?;
800
801    // Change to new root
802    std::env::set_current_dir("/")
803        .map_err(|e| NucleusError::PivotRootError(format!("Failed to chdir to /: {}", e)))?;
804
805    // Unmount old root
806    nix::mount::umount2("/.old_root", nix::mount::MntFlags::MNT_DETACH)
807        .map_err(|e| NucleusError::PivotRootError(format!("Failed to unmount old root: {}", e)))?;
808
809    // Remove old root directory
810    let _ = std::fs::remove_dir("/.old_root");
811
812    Ok(())
813}
814
815/// Implement root switch using chroot(2)
816///
817/// chroot is less secure than pivot_root but works in more situations
818fn chroot_impl(new_root: &Path) -> Result<()> {
819    chroot(new_root)
820        .map_err(|e| NucleusError::PivotRootError(format!("chroot syscall failed: {}", e)))?;
821
822    // Change to new root
823    std::env::set_current_dir("/")
824        .map_err(|e| NucleusError::PivotRootError(format!("Failed to chdir to /: {}", e)))?;
825
826    info!("Successfully switched root using chroot");
827
828    Ok(())
829}
830
831/// Mount secret files into the container root.
832///
833/// Each secret is bind-mounted read-only from its source to the destination
834/// path inside the container. Intermediate directories are created as needed.
835pub fn mount_secrets(root: &Path, secrets: &[crate::container::SecretMount]) -> Result<()> {
836    if secrets.is_empty() {
837        return Ok(());
838    }
839
840    info!("Mounting {} secret(s) into container", secrets.len());
841
842    for secret in secrets {
843        if !secret.source.exists() {
844            return Err(NucleusError::FilesystemError(format!(
845                "Secret source does not exist: {:?}",
846                secret.source
847            )));
848        }
849
850        // Destination inside container root
851        let dest = resolve_container_destination(root, &secret.dest)?;
852
853        // Create parent directories
854        if let Some(parent) = dest.parent() {
855            std::fs::create_dir_all(parent).map_err(|e| {
856                NucleusError::FilesystemError(format!(
857                    "Failed to create secret mount parent {:?}: {}",
858                    parent, e
859                ))
860            })?;
861        }
862
863        // Create mount point file
864        if secret.source.is_file() {
865            std::fs::write(&dest, "").map_err(|e| {
866                NucleusError::FilesystemError(format!(
867                    "Failed to create secret mount point {:?}: {}",
868                    dest, e
869                ))
870            })?;
871        } else {
872            std::fs::create_dir_all(&dest).map_err(|e| {
873                NucleusError::FilesystemError(format!(
874                    "Failed to create secret mount dir {:?}: {}",
875                    dest, e
876                ))
877            })?;
878        }
879
880        // Bind mount read-only
881        mount(
882            Some(secret.source.as_path()),
883            &dest,
884            None::<&str>,
885            MsFlags::MS_BIND,
886            None::<&str>,
887        )
888        .map_err(|e| {
889            NucleusError::FilesystemError(format!(
890                "Failed to bind mount secret {:?}: {}",
891                secret.source, e
892            ))
893        })?;
894
895        mount(
896            None::<&str>,
897            &dest,
898            None::<&str>,
899            MsFlags::MS_REMOUNT
900                | MsFlags::MS_BIND
901                | MsFlags::MS_RDONLY
902                | MsFlags::MS_NOSUID
903                | MsFlags::MS_NODEV
904                | MsFlags::MS_NOEXEC,
905            None::<&str>,
906        )
907        .map_err(|e| {
908            NucleusError::FilesystemError(format!(
909                "Failed to remount secret {:?} read-only: {}",
910                dest, e
911            ))
912        })?;
913
914        // Apply configured file permissions on the mount point
915        if secret.source.is_file() {
916            use std::os::unix::fs::PermissionsExt;
917            let perms = std::fs::Permissions::from_mode(secret.mode);
918            if let Err(e) = std::fs::set_permissions(&dest, perms) {
919                warn!(
920                    "Failed to set mode {:04o} on secret {:?}: {} (bind mount may override)",
921                    secret.mode, dest, e
922                );
923            }
924        }
925
926        debug!(
927            "Mounted secret {:?} -> {:?} (mode {:04o})",
928            secret.source, secret.dest, secret.mode
929        );
930    }
931
932    Ok(())
933}
934
935/// Mount secrets onto a dedicated in-memory tmpfs instead of bind-mounting host paths.
936///
937/// Creates a per-container tmpfs at `<root>/run/secrets` with MS_NOEXEC | MS_NOSUID | MS_NODEV,
938/// copies secret contents into it, then zeros the read buffer. This ensures secrets
939/// never reference host-side files after setup and are never persisted to disk.
940pub fn mount_secrets_inmemory(
941    root: &Path,
942    secrets: &[crate::container::SecretMount],
943    identity: &crate::container::ProcessIdentity,
944) -> Result<()> {
945    if secrets.is_empty() {
946        return Ok(());
947    }
948
949    info!("Mounting {} secret(s) on in-memory tmpfs", secrets.len());
950
951    let secrets_dir = root.join("run/secrets");
952    std::fs::create_dir_all(&secrets_dir).map_err(|e| {
953        NucleusError::FilesystemError(format!(
954            "Failed to create secrets dir {:?}: {}",
955            secrets_dir, e
956        ))
957    })?;
958
959    // Mount a size-limited tmpfs for secrets (16 MiB max)
960    if let Err(e) = mount(
961        Some("tmpfs"),
962        &secrets_dir,
963        Some("tmpfs"),
964        MsFlags::MS_NOSUID | MsFlags::MS_NODEV | MsFlags::MS_NOEXEC,
965        Some("size=16m,mode=0700"),
966    ) {
967        let _ = std::fs::remove_dir_all(&secrets_dir);
968        return Err(NucleusError::FilesystemError(format!(
969            "Failed to mount secrets tmpfs at {:?}: {}",
970            secrets_dir, e
971        )));
972    }
973
974    if !identity.is_root() {
975        nix::unistd::chown(
976            &secrets_dir,
977            Some(nix::unistd::Uid::from_raw(identity.uid)),
978            Some(nix::unistd::Gid::from_raw(identity.gid)),
979        )
980        .map_err(|e| {
981            let _ = nix::mount::umount2(&secrets_dir, nix::mount::MntFlags::MNT_DETACH);
982            let _ = std::fs::remove_dir_all(&secrets_dir);
983            NucleusError::FilesystemError(format!(
984                "Failed to set /run/secrets owner to {}:{}: {}",
985                identity.uid, identity.gid, e
986            ))
987        })?;
988    }
989
990    // Rollback: unmount tmpfs and remove dir if any secret fails
991    let result = mount_secrets_inmemory_inner(&secrets_dir, root, secrets, identity);
992    if let Err(ref e) = result {
993        let _ = nix::mount::umount2(&secrets_dir, nix::mount::MntFlags::MNT_DETACH);
994        let _ = std::fs::remove_dir_all(&secrets_dir);
995        return Err(NucleusError::FilesystemError(format!(
996            "Secret mount failed (rolled back): {}",
997            e
998        )));
999    }
1000
1001    info!("All secrets mounted on in-memory tmpfs");
1002    Ok(())
1003}
1004
1005fn mount_secrets_inmemory_inner(
1006    secrets_dir: &Path,
1007    root: &Path,
1008    secrets: &[crate::container::SecretMount],
1009    identity: &crate::container::ProcessIdentity,
1010) -> Result<()> {
1011    for secret in secrets {
1012        if !secret.source.exists() {
1013            return Err(NucleusError::FilesystemError(format!(
1014                "Secret source does not exist: {:?}",
1015                secret.source
1016            )));
1017        }
1018
1019        // Read secret content from host
1020        let mut content = std::fs::read(&secret.source).map_err(|e| {
1021            NucleusError::FilesystemError(format!(
1022                "Failed to read secret {:?}: {}",
1023                secret.source, e
1024            ))
1025        })?;
1026
1027        // Determine destination path inside the secrets tmpfs
1028        let dest = resolve_container_destination(secrets_dir, &secret.dest)?;
1029
1030        // Create parent directories within the tmpfs
1031        if let Some(parent) = dest.parent() {
1032            std::fs::create_dir_all(parent).map_err(|e| {
1033                NucleusError::FilesystemError(format!(
1034                    "Failed to create secret parent dir {:?}: {}",
1035                    parent, e
1036                ))
1037            })?;
1038        }
1039
1040        // Write secret content to tmpfs
1041        std::fs::write(&dest, &content).map_err(|e| {
1042            NucleusError::FilesystemError(format!("Failed to write secret to {:?}: {}", dest, e))
1043        })?;
1044
1045        // Set permissions
1046        {
1047            use std::os::unix::fs::PermissionsExt;
1048            let perms = std::fs::Permissions::from_mode(secret.mode);
1049            std::fs::set_permissions(&dest, perms).map_err(|e| {
1050                NucleusError::FilesystemError(format!(
1051                    "Failed to set permissions on secret {:?}: {}",
1052                    dest, e
1053                ))
1054            })?;
1055        }
1056
1057        if !identity.is_root() {
1058            nix::unistd::chown(
1059                &dest,
1060                Some(nix::unistd::Uid::from_raw(identity.uid)),
1061                Some(nix::unistd::Gid::from_raw(identity.gid)),
1062            )
1063            .map_err(|e| {
1064                NucleusError::FilesystemError(format!(
1065                    "Failed to set permissions owner on secret {:?} to {}:{}: {}",
1066                    dest, identity.uid, identity.gid, e
1067                ))
1068            })?;
1069        }
1070
1071        // Zero the in-memory buffer
1072        zeroize::Zeroize::zeroize(&mut content);
1073        drop(content);
1074
1075        // Also bind-mount the secret to its expected container path for compatibility
1076        let container_dest = resolve_container_destination(root, &secret.dest)?;
1077        if container_dest != dest {
1078            if let Some(parent) = container_dest.parent() {
1079                std::fs::create_dir_all(parent).map_err(|e| {
1080                    NucleusError::FilesystemError(format!(
1081                        "Failed to create secret mount parent {:?}: {}",
1082                        parent, e
1083                    ))
1084                })?;
1085            }
1086
1087            if secret.source.is_file() {
1088                std::fs::write(&container_dest, "").map_err(|e| {
1089                    NucleusError::FilesystemError(format!(
1090                        "Failed to create secret mount point {:?}: {}",
1091                        container_dest, e
1092                    ))
1093                })?;
1094            }
1095
1096            mount(
1097                Some(dest.as_path()),
1098                &container_dest,
1099                None::<&str>,
1100                MsFlags::MS_BIND,
1101                None::<&str>,
1102            )
1103            .map_err(|e| {
1104                NucleusError::FilesystemError(format!(
1105                    "Failed to bind mount secret {:?} -> {:?}: {}",
1106                    dest, container_dest, e
1107                ))
1108            })?;
1109
1110            mount(
1111                None::<&str>,
1112                &container_dest,
1113                None::<&str>,
1114                MsFlags::MS_REMOUNT
1115                    | MsFlags::MS_BIND
1116                    | MsFlags::MS_RDONLY
1117                    | MsFlags::MS_NOSUID
1118                    | MsFlags::MS_NODEV
1119                    | MsFlags::MS_NOEXEC,
1120                None::<&str>,
1121            )
1122            .map_err(|e| {
1123                NucleusError::FilesystemError(format!(
1124                    "Failed to remount secret {:?} read-only: {}",
1125                    container_dest, e
1126                ))
1127            })?;
1128        }
1129
1130        debug!(
1131            "Secret {:?} -> {:?} (in-memory tmpfs, mode {:04o})",
1132            secret.source, secret.dest, secret.mode
1133        );
1134    }
1135
1136    Ok(())
1137}
1138
#[cfg(test)]
mod tests {
    use super::*;

    /// `sysrq-trigger` must appear in the null-masked /proc list.
    #[test]
    fn test_proc_mask_includes_sysrq_trigger() {
        let is_masked = PROC_NULL_MASKED.contains(&"sysrq-trigger");
        assert!(
            is_masked,
            "/proc/sysrq-trigger must be masked to prevent host DoS"
        );
    }

    /// `timer_stats` must appear in the null-masked /proc list.
    #[test]
    fn test_proc_mask_includes_timer_stats() {
        let is_masked = PROC_NULL_MASKED.contains(&"timer_stats");
        assert!(
            is_masked,
            "/proc/timer_stats must be masked to prevent kernel info leakage"
        );
    }

    /// All three `kpage*` entries must appear in the null-masked /proc list.
    #[test]
    fn test_proc_mask_includes_kpage_files() {
        const KPAGE_FILES: [&str; 3] = ["kpagecount", "kpageflags", "kpagecgroup"];

        KPAGE_FILES.iter().for_each(|name| {
            assert!(
                PROC_NULL_MASKED.contains(name),
                "/proc/{} must be masked to prevent host memory layout leakage",
                name
            );
        });
    }

    /// Both mask lists must contain the entries the original test attributes
    /// to the OCI runtime spec's masked paths.
    #[test]
    fn test_proc_mask_includes_oci_standard_paths() {
        // Entries expected in the null-masked list
        const NULL_MASKED: [&str; 5] = ["kallsyms", "kcore", "sched_debug", "keys", "config.gz"];
        // Entries expected in the tmpfs-masked list
        const TMPFS_MASKED: [&str; 4] = ["acpi", "bus", "scsi", "sys"];

        NULL_MASKED.iter().for_each(|name| {
            assert!(
                PROC_NULL_MASKED.contains(name),
                "/proc/{} must be in null-masked list (OCI spec)",
                name
            );
        });

        TMPFS_MASKED.iter().for_each(|name| {
            assert!(
                PROC_TMPFS_MASKED.contains(name),
                "/proc/{} must be in tmpfs-masked list (OCI spec)",
                name
            );
        });
    }
}