Skip to main content

nucleus/filesystem/
mount.rs

1use crate::error::{NucleusError, Result};
2use nix::fcntl::{open, OFlag};
3use nix::mount::{mount, MsFlags};
4use nix::sys::stat::{fstat, makedev, mknod, Mode, SFlag};
5use nix::unistd::chroot;
6use std::fs::OpenOptions;
7use std::io::Read;
8use std::os::fd::AsRawFd;
9use std::os::unix::fs::OpenOptionsExt;
10use std::path::{Component, Path, PathBuf};
11use tracing::{debug, info, warn};
12
/// One row of the mount audit table: a mount point and the options it has to
/// be mounted with.
struct ExpectedMount {
    /// Mount point as it appears in the second field of a `/proc/self/mounts` line.
    path: &'static str,
    /// Mount options that must all be present in the entry's option list.
    required_flags: &'static [&'static str],
    /// When `true`, the mount has to exist in production mode: a missing
    /// critical mount (e.g. /proc) is reported as a violation instead of being
    /// silently skipped.
    critical: bool,
}
21
22/// Known mount paths and the flags they must carry in production mode.
23const PRODUCTION_MOUNT_EXPECTATIONS: &[ExpectedMount] = &[
24    ExpectedMount {
25        path: "/bin",
26        required_flags: &["ro", "nosuid", "nodev"],
27        critical: true,
28    },
29    ExpectedMount {
30        path: "/usr",
31        required_flags: &["ro", "nosuid", "nodev"],
32        critical: true,
33    },
34    ExpectedMount {
35        path: "/lib",
36        required_flags: &["ro", "nosuid", "nodev"],
37        critical: false, // not all rootfs layouts have /lib
38    },
39    ExpectedMount {
40        path: "/lib64",
41        required_flags: &["ro", "nosuid", "nodev"],
42        critical: false, // not all rootfs layouts have /lib64
43    },
44    ExpectedMount {
45        path: "/etc",
46        required_flags: &["ro", "nosuid", "nodev"],
47        critical: true,
48    },
49    ExpectedMount {
50        path: "/nix",
51        required_flags: &["ro", "nosuid", "nodev"],
52        critical: false, // only present on NixOS-based rootfs
53    },
54    ExpectedMount {
55        path: "/sbin",
56        required_flags: &["ro", "nosuid", "nodev"],
57        critical: false, // not all rootfs layouts have /sbin
58    },
59    ExpectedMount {
60        path: "/proc",
61        required_flags: &["nosuid", "nodev", "noexec"],
62        critical: true,
63    },
64    ExpectedMount {
65        path: "/run/secrets",
66        required_flags: &["nosuid", "nodev", "noexec"],
67        critical: false, // only present when secrets are configured
68    },
69];
70
71/// Normalize an absolute container destination path and reject traversal.
72///
73/// Returns a normalized absolute path containing only `RootDir` and `Normal`
74/// components. `.` segments are ignored; `..` and relative paths are rejected.
75pub fn normalize_container_destination(dest: &Path) -> Result<PathBuf> {
76    if !dest.is_absolute() {
77        return Err(NucleusError::ConfigError(format!(
78            "Container destination must be absolute: {:?}",
79            dest
80        )));
81    }
82
83    let mut normalized = PathBuf::from("/");
84    let mut saw_component = false;
85
86    for component in dest.components() {
87        match component {
88            Component::RootDir => {}
89            Component::CurDir => {}
90            Component::Normal(part) => {
91                normalized.push(part);
92                saw_component = true;
93            }
94            Component::ParentDir => {
95                return Err(NucleusError::ConfigError(format!(
96                    "Container destination must not contain parent traversal: {:?}",
97                    dest
98                )));
99            }
100            Component::Prefix(_) => {
101                return Err(NucleusError::ConfigError(format!(
102                    "Unsupported container destination prefix: {:?}",
103                    dest
104                )));
105            }
106        }
107    }
108
109    if !saw_component {
110        return Err(NucleusError::ConfigError(format!(
111            "Container destination must not be the root directory: {:?}",
112            dest
113        )));
114    }
115
116    Ok(normalized)
117}
118
119/// Resolve a validated container destination under a host-side root directory.
120pub fn resolve_container_destination(root: &Path, dest: &Path) -> Result<PathBuf> {
121    let normalized = normalize_container_destination(dest)?;
122    let relative = normalized.strip_prefix("/").map_err(|_| {
123        NucleusError::ConfigError(format!(
124            "Container destination is not absolute after normalization: {:?}",
125            normalized
126        ))
127    })?;
128    Ok(root.join(relative))
129}
130
131pub(crate) fn read_regular_file_nofollow(path: &Path) -> Result<Vec<u8>> {
132    let mut file = OpenOptions::new()
133        .read(true)
134        .custom_flags(libc::O_NOFOLLOW | libc::O_CLOEXEC)
135        .open(path)
136        .map_err(|e| {
137            NucleusError::FilesystemError(format!(
138                "Failed to open file {:?} with O_NOFOLLOW: {}",
139                path, e
140            ))
141        })?;
142
143    let metadata = file.metadata().map_err(|e| {
144        NucleusError::FilesystemError(format!("Failed to stat file {:?}: {}", path, e))
145    })?;
146    if !metadata.is_file() {
147        return Err(NucleusError::FilesystemError(format!(
148            "Expected regular file for {:?}, found non-file source",
149            path
150        )));
151    }
152
153    let mut content = Vec::new();
154    file.read_to_end(&mut content).map_err(|e| {
155        NucleusError::FilesystemError(format!("Failed to read file {:?}: {}", path, e))
156    })?;
157    Ok(content)
158}
159
160/// Audit all mounts in the container's mount namespace.
161///
162/// Reads /proc/self/mounts and verifies that each known mount point carries
163/// its expected flags. In production mode, any missing flag is fatal.
164/// Returns Ok(()) if all checks pass, or a list of violations.
165pub fn audit_mounts(production_mode: bool) -> Result<()> {
166    let mounts_content = std::fs::read_to_string("/proc/self/mounts").map_err(|e| {
167        NucleusError::FilesystemError(format!("Failed to read /proc/self/mounts: {}", e))
168    })?;
169
170    let mut violations = Vec::new();
171
172    for expectation in PRODUCTION_MOUNT_EXPECTATIONS {
173        // Find the mount entry for this path
174        let mount_entry = mounts_content.lines().find(|line| {
175            let parts: Vec<&str> = line.split_whitespace().collect();
176            parts.len() >= 4 && parts[1] == expectation.path
177        });
178
179        if let Some(entry) = mount_entry {
180            let parts: Vec<&str> = entry.split_whitespace().collect();
181            if parts.len() >= 4 {
182                let options = parts[3];
183                for &flag in expectation.required_flags {
184                    if !options.split(',').any(|opt| opt == flag) {
185                        violations.push(format!(
186                            "Mount {} missing required flag '{}' (has: {})",
187                            expectation.path, flag, options
188                        ));
189                    }
190                }
191            }
192        } else if expectation.critical && production_mode {
193            violations.push(format!(
194                "Critical mount {} is missing from the mount namespace",
195                expectation.path
196            ));
197        }
198    }
199
200    if violations.is_empty() {
201        info!("Mount audit passed: all expected flags verified");
202        Ok(())
203    } else if production_mode {
204        Err(NucleusError::FilesystemError(format!(
205            "Mount audit failed in production mode:\n  {}",
206            violations.join("\n  ")
207        )))
208    } else {
209        for v in &violations {
210            warn!("Mount audit: {}", v);
211        }
212        Ok(())
213    }
214}
215
216/// Create minimal filesystem structure in the new root
217pub fn create_minimal_fs(root: &Path) -> Result<()> {
218    info!("Creating minimal filesystem structure at {:?}", root);
219
220    // Create essential directories
221    let dirs = vec![
222        "dev",
223        "proc",
224        "sys",
225        "tmp",
226        "bin",
227        "sbin",
228        "usr",
229        "lib",
230        "lib64",
231        "etc",
232        "nix",
233        "nix/store",
234        "run",
235        "context",
236    ];
237
238    for dir in dirs {
239        let path = root.join(dir);
240        std::fs::create_dir_all(&path).map_err(|e| {
241            NucleusError::FilesystemError(format!("Failed to create directory {:?}: {}", path, e))
242        })?;
243    }
244
245    info!("Created minimal filesystem structure");
246
247    Ok(())
248}
249
250/// Create essential device nodes in /dev
251///
252/// In rootless mode, device node creation will fail gracefully
253pub fn create_dev_nodes(dev_path: &Path, include_tty: bool) -> Result<()> {
254    info!("Creating device nodes at {:?}", dev_path);
255
256    // Device nodes: (name, type, major, minor)
257    let mut devices = vec![
258        ("null", SFlag::S_IFCHR, 1, 3),
259        ("zero", SFlag::S_IFCHR, 1, 5),
260        ("full", SFlag::S_IFCHR, 1, 7),
261        ("random", SFlag::S_IFCHR, 1, 8),
262        ("urandom", SFlag::S_IFCHR, 1, 9),
263    ];
264    if include_tty {
265        devices.push(("tty", SFlag::S_IFCHR, 5, 0));
266    }
267
268    let mut created_count = 0;
269    let mut failed_count = 0;
270
271    for (name, dev_type, major, minor) in devices {
272        let path = dev_path.join(name);
273        let mode = Mode::from_bits_truncate(0o660);
274        let dev = makedev(major, minor);
275
276        match mknod(&path, dev_type, mode, dev) {
277            Ok(_) => {
278                info!("Created device node: {:?}", path);
279                created_count += 1;
280            }
281            Err(e) => {
282                // In rootless mode, mknod fails - this is expected
283                warn!(
284                    "Failed to create device node {:?}: {} (this is normal in rootless mode)",
285                    path, e
286                );
287                failed_count += 1;
288            }
289        }
290    }
291
292    if created_count > 0 {
293        info!("Successfully created {} device nodes", created_count);
294    }
295    if failed_count > 0 {
296        info!("Skipped {} device nodes (rootless mode)", failed_count);
297    }
298
299    Ok(())
300}
301
302/// Bind mount a pre-built rootfs (e.g. a Nix store closure) into the container.
303///
304/// Instead of exposing the full host /bin, /usr, /lib, /lib64, /nix, this mounts
305/// a minimal, purpose-built root filesystem. Suitable for production services.
306pub fn bind_mount_rootfs(root: &Path, rootfs_path: &Path) -> Result<()> {
307    info!(
308        "Bind mounting production rootfs {:?} into container {:?}",
309        rootfs_path, root
310    );
311
312    if std::fs::symlink_metadata(rootfs_path).is_err() {
313        return Err(NucleusError::FilesystemError(format!(
314            "Rootfs path does not exist: {:?}",
315            rootfs_path
316        )));
317    }
318
319    // Bind mount the rootfs contents into the container root.
320    // The rootfs is expected to contain a standard FHS layout (/bin, /lib, /etc, etc.)
321    // produced by a Nix buildEnv or similar.
322    let subdirs = ["bin", "sbin", "lib", "lib64", "usr", "etc", "nix"];
323
324    for subdir in &subdirs {
325        let source = rootfs_path.join(subdir);
326        if !source.exists() {
327            debug!("Rootfs subdir {} not present, skipping", subdir);
328            continue;
329        }
330
331        let target = root.join(subdir);
332        std::fs::create_dir_all(&target).map_err(|e| {
333            NucleusError::FilesystemError(format!(
334                "Failed to create mount point {:?}: {}",
335                target, e
336            ))
337        })?;
338
339        mount(
340            Some(&source),
341            &target,
342            None::<&str>,
343            MsFlags::MS_BIND | MsFlags::MS_REC,
344            None::<&str>,
345        )
346        .map_err(|e| {
347            NucleusError::FilesystemError(format!(
348                "Failed to bind mount rootfs {:?} -> {:?}: {}",
349                source, target, e
350            ))
351        })?;
352
353        // Remount read-only
354        mount(
355            None::<&str>,
356            &target,
357            None::<&str>,
358            MsFlags::MS_REMOUNT
359                | MsFlags::MS_BIND
360                | MsFlags::MS_RDONLY
361                | MsFlags::MS_REC
362                | MsFlags::MS_NOSUID
363                | MsFlags::MS_NODEV,
364            None::<&str>,
365        )
366        .map_err(|e| {
367            NucleusError::FilesystemError(format!(
368                "Failed to remount rootfs {:?} read-only: {}",
369                target, e
370            ))
371        })?;
372
373        info!("Mounted rootfs/{} read-only", subdir);
374    }
375
376    Ok(())
377}
378
379/// Bind mount essential host directories into container
380///
381/// This allows host binaries to be accessible inside the container.
382/// Used in agent mode. Production mode should use bind_mount_rootfs() instead.
383pub fn bind_mount_host_paths(root: &Path, best_effort: bool) -> Result<()> {
384    info!("Bind mounting host paths into container");
385
386    // Essential paths to bind mount (read-only)
387    let host_paths = vec![
388        "/bin", "/usr", "/lib", "/lib64", "/nix", // For NixOS
389    ];
390
391    for host_path in host_paths {
392        let host = Path::new(host_path);
393
394        // Only mount if the path exists on the host
395        if !host.exists() {
396            debug!("Skipping {} (not present on host)", host_path);
397            continue;
398        }
399
400        let container_path = root.join(host_path.trim_start_matches('/'));
401
402        // Create mount point
403        if let Err(e) = std::fs::create_dir_all(&container_path) {
404            if best_effort {
405                warn!("Failed to create mount point {:?}: {}", container_path, e);
406                continue;
407            }
408            return Err(NucleusError::FilesystemError(format!(
409                "Failed to create mount point {:?}: {}",
410                container_path, e
411            )));
412        }
413
414        // Attempt bind mount
415        // Note: Linux ignores MS_RDONLY on the initial bind mount call.
416        // A second remount is required to actually enforce read-only.
417        match mount(
418            Some(host),
419            &container_path,
420            None::<&str>,
421            MsFlags::MS_BIND | MsFlags::MS_REC,
422            None::<&str>,
423        ) {
424            Ok(_) => {
425                // Remount as read-only – required because MS_RDONLY is ignored on initial bind
426                mount(
427                    None::<&str>,
428                    &container_path,
429                    None::<&str>,
430                    MsFlags::MS_REMOUNT
431                        | MsFlags::MS_BIND
432                        | MsFlags::MS_RDONLY
433                        | MsFlags::MS_REC
434                        | MsFlags::MS_NOSUID
435                        | MsFlags::MS_NODEV,
436                    None::<&str>,
437                )
438                .map_err(|e| {
439                    NucleusError::FilesystemError(format!(
440                        "Failed to remount {} as read-only: {}",
441                        host_path, e
442                    ))
443                })?;
444                info!(
445                    "Bind mounted {} to {:?} (read-only)",
446                    host_path, container_path
447                );
448            }
449            Err(e) => {
450                if best_effort {
451                    warn!(
452                        "Failed to bind mount {}: {} (continuing anyway)",
453                        host_path, e
454                    );
455                } else {
456                    return Err(NucleusError::FilesystemError(format!(
457                        "Failed to bind mount {}: {}",
458                        host_path, e
459                    )));
460                }
461            }
462        }
463    }
464
465    Ok(())
466}
467
/// H7: Host paths that are refused as bind mount sources for containers.
///
/// Consulted by `validate_bind_mount_source` before a bind volume is mounted.
const DENIED_BIND_MOUNT_SOURCES: &[&str] = &[
    // Whole trees.
    "/", "/proc", "/sys", "/dev", "/boot",
    // Individual account and privilege files.
    "/etc/shadow", "/etc/sudoers", "/etc/passwd", "/etc/gshadow",
];
480
481/// Validate that a bind mount source is not a sensitive host path.
482fn validate_bind_mount_source(source: &Path) -> Result<()> {
483    let source_str = source.to_string_lossy();
484    for denied in DENIED_BIND_MOUNT_SOURCES {
485        if source_str == *denied {
486            return Err(NucleusError::FilesystemError(format!(
487                "Bind mount source '{}' is a sensitive host path and cannot be mounted into containers",
488                source.display()
489            )));
490        }
491    }
492    Ok(())
493}
494
495/// Mount persistent bind volumes and ephemeral tmpfs volumes into the container root.
496pub fn mount_volumes(root: &Path, volumes: &[crate::container::VolumeMount]) -> Result<()> {
497    use crate::container::VolumeSource;
498
499    if volumes.is_empty() {
500        return Ok(());
501    }
502
503    info!("Mounting {} volume(s) into container", volumes.len());
504
505    for volume in volumes {
506        let dest = resolve_container_destination(root, &volume.dest)?;
507
508        match &volume.source {
509            VolumeSource::Bind { source } => {
510                // H7: Deny bind-mounting sensitive host paths
511                validate_bind_mount_source(source)?;
512
513                // Use symlink_metadata (lstat) instead of .exists() to avoid
514                // following symlinks in the existence check (O_NOFOLLOW semantics).
515                if std::fs::symlink_metadata(source).is_err() {
516                    return Err(NucleusError::FilesystemError(format!(
517                        "Volume source does not exist: {:?}",
518                        source
519                    )));
520                }
521
522                if let Some(parent) = dest.parent() {
523                    std::fs::create_dir_all(parent).map_err(|e| {
524                        NucleusError::FilesystemError(format!(
525                            "Failed to create volume mount parent {:?}: {}",
526                            parent, e
527                        ))
528                    })?;
529                }
530
531                let recursive = source.is_dir();
532                if source.is_file() {
533                    std::fs::write(&dest, "").map_err(|e| {
534                        NucleusError::FilesystemError(format!(
535                            "Failed to create volume mount point {:?}: {}",
536                            dest, e
537                        ))
538                    })?;
539                } else {
540                    std::fs::create_dir_all(&dest).map_err(|e| {
541                        NucleusError::FilesystemError(format!(
542                            "Failed to create volume mount dir {:?}: {}",
543                            dest, e
544                        ))
545                    })?;
546                }
547
548                let initial_flags = if recursive {
549                    MsFlags::MS_BIND | MsFlags::MS_REC
550                } else {
551                    MsFlags::MS_BIND
552                };
553                mount(
554                    Some(source.as_path()),
555                    &dest,
556                    None::<&str>,
557                    initial_flags,
558                    None::<&str>,
559                )
560                .map_err(|e| {
561                    NucleusError::FilesystemError(format!(
562                        "Failed to bind mount volume {:?} -> {:?}: {}",
563                        source, dest, e
564                    ))
565                })?;
566
567                let mut remount_flags =
568                    MsFlags::MS_REMOUNT | MsFlags::MS_BIND | MsFlags::MS_NOSUID | MsFlags::MS_NODEV;
569                if recursive {
570                    remount_flags |= MsFlags::MS_REC;
571                }
572                if volume.read_only {
573                    remount_flags |= MsFlags::MS_RDONLY;
574                }
575
576                mount(
577                    None::<&str>,
578                    &dest,
579                    None::<&str>,
580                    remount_flags,
581                    None::<&str>,
582                )
583                .map_err(|e| {
584                    NucleusError::FilesystemError(format!(
585                        "Failed to remount volume {:?} with final flags: {}",
586                        dest, e
587                    ))
588                })?;
589
590                info!(
591                    "Mounted bind volume {:?} -> {:?} ({})",
592                    source,
593                    volume.dest,
594                    if volume.read_only { "ro" } else { "rw" }
595                );
596            }
597            VolumeSource::Tmpfs { size } => {
598                std::fs::create_dir_all(&dest).map_err(|e| {
599                    NucleusError::FilesystemError(format!(
600                        "Failed to create tmpfs mount dir {:?}: {}",
601                        dest, e
602                    ))
603                })?;
604
605                // M8: Validate size parameter to prevent option injection.
606                // Only allow digits, optionally followed by K/M/G suffix.
607                if let Some(value) = size.as_ref() {
608                    let valid = value
609                        .chars()
610                        .all(|c| c.is_ascii_digit() || "kKmMgG".contains(c));
611                    if !valid || value.is_empty() {
612                        return Err(NucleusError::FilesystemError(format!(
613                            "Invalid tmpfs size value '{}': only digits with optional K/M/G suffix allowed",
614                            value
615                        )));
616                    }
617                }
618
619                // M7: Default to 64MB instead of half of physical RAM to
620                // prevent memory DoS from unbounded tmpfs volumes.
621                let mount_data = size
622                    .as_ref()
623                    .map(|value| format!("size={},mode=0700", value))
624                    .unwrap_or_else(|| "size=64M,mode=0700".to_string());
625
626                let mut flags = MsFlags::MS_NOSUID | MsFlags::MS_NODEV;
627                if volume.read_only {
628                    flags |= MsFlags::MS_RDONLY;
629                }
630                mount(
631                    Some("tmpfs"),
632                    &dest,
633                    Some("tmpfs"),
634                    flags,
635                    Some(mount_data.as_str()),
636                )
637                .map_err(|e| {
638                    NucleusError::FilesystemError(format!(
639                        "Failed to mount tmpfs volume at {:?}: {}",
640                        dest, e
641                    ))
642                })?;
643
644                info!(
645                    "Mounted tmpfs volume at {:?}{}{}",
646                    volume.dest,
647                    size.as_ref()
648                        .map(|value| format!(" (size={})", value))
649                        .unwrap_or_default(),
650                    if volume.read_only { " (ro)" } else { "" }
651                );
652            }
653        }
654    }
655
656    Ok(())
657}
658
659/// Mount procfs at the given path
660///
661/// In rootless mode, procfs mounting should work due to user namespace capabilities.
662/// When `hide_pids` is true, mounts with hidepid=2 so processes cannot enumerate
663/// other PIDs (production hardening).
664pub fn mount_procfs(
665    proc_path: &Path,
666    best_effort: bool,
667    read_only: bool,
668    hide_pids: bool,
669) -> Result<()> {
670    info!(
671        "Mounting procfs at {:?} (hidepid={})",
672        proc_path,
673        if hide_pids { "2" } else { "0" }
674    );
675
676    let mount_data: Option<&str> = if hide_pids { Some("hidepid=2") } else { None };
677
678    match mount(
679        Some("proc"),
680        proc_path,
681        Some("proc"),
682        MsFlags::MS_NOSUID | MsFlags::MS_NODEV | MsFlags::MS_NOEXEC,
683        mount_data,
684    ) {
685        Ok(_) => {
686            if read_only {
687                mount(
688                    None::<&str>,
689                    proc_path,
690                    None::<&str>,
691                    MsFlags::MS_REMOUNT
692                        | MsFlags::MS_RDONLY
693                        | MsFlags::MS_NOSUID
694                        | MsFlags::MS_NODEV
695                        | MsFlags::MS_NOEXEC,
696                    None::<&str>,
697                )
698                .map_err(|e| {
699                    NucleusError::FilesystemError(format!(
700                        "Failed to remount procfs read-only: {}",
701                        e
702                    ))
703                })?;
704                info!("Successfully mounted procfs (read-only)");
705            } else {
706                info!("Successfully mounted procfs");
707            }
708            Ok(())
709        }
710        Err(e) => {
711            if best_effort {
712                warn!("Failed to mount procfs: {} (continuing anyway)", e);
713                Ok(())
714            } else {
715                Err(NucleusError::FilesystemError(format!(
716                    "Failed to mount procfs: {}",
717                    e
718                )))
719            }
720        }
721    }
722}
723
/// File entries under /proc that are hidden by bind-mounting /dev/null over
/// them — matches OCI runtime spec masked paths.
///
/// Public so tests can refer to the canonical list of sensitive /proc entries
/// that container processes must not see. Names are relative to the procfs
/// mount point.
pub const PROC_NULL_MASKED: &[&str] = &[
    "kallsyms", "kcore", "sched_debug", "timer_list", "timer_stats", "keys",
    "latency_stats", "config.gz", "sysrq-trigger", "kpagecount", "kpageflags",
    "kpagecgroup",
];
741
/// Directory entries under /proc that are hidden by mounting an empty tmpfs
/// over them. Names are relative to the procfs mount point.
pub const PROC_TMPFS_MASKED: &[&str] = &[
    "acpi", "bus", "irq", "scsi", "sys",
];
744
745/// Mask sensitive /proc paths by bind-mounting /dev/null or tmpfs over them
746///
747/// This reduces kernel information leakage from the container. Follows OCI runtime
748/// conventions for masked paths.
749///
750/// SEC-06: When `production` is true, failures to mask critical paths
751/// (kcore, kallsyms, sysrq-trigger) are fatal instead of warn-and-continue.
752pub fn mask_proc_paths(proc_path: &Path, production: bool) -> Result<()> {
753    info!("Masking sensitive /proc paths");
754
755    const CRITICAL_PROC_PATHS: &[&str] = &["kcore", "kallsyms", "sysrq-trigger"];
756
757    let dev_null = Path::new("/dev/null");
758
759    for name in PROC_NULL_MASKED {
760        let target = proc_path.join(name);
761        if !target.exists() {
762            continue;
763        }
764        match mount(
765            Some(dev_null),
766            &target,
767            None::<&str>,
768            MsFlags::MS_BIND,
769            None::<&str>,
770        ) {
771            Ok(_) => {
772                // Remount read-only: Linux ignores MS_RDONLY on the initial bind mount,
773                // so a separate MS_REMOUNT|MS_BIND|MS_RDONLY call is required.
774                if let Err(e) = mount(
775                    None::<&str>,
776                    &target,
777                    None::<&str>,
778                    MsFlags::MS_REMOUNT | MsFlags::MS_BIND | MsFlags::MS_RDONLY,
779                    None::<&str>,
780                ) {
781                    if production && CRITICAL_PROC_PATHS.contains(name) {
782                        return Err(NucleusError::FilesystemError(format!(
783                            "Failed to remount /proc/{} read-only in production mode: {}",
784                            name, e
785                        )));
786                    }
787                    warn!(
788                        "Failed to remount /proc/{} read-only: {} (continuing)",
789                        name, e
790                    );
791                }
792                debug!("Masked /proc/{} (read-only)", name);
793            }
794            Err(e) => {
795                if production && CRITICAL_PROC_PATHS.contains(name) {
796                    return Err(NucleusError::FilesystemError(format!(
797                        "Failed to mask critical /proc/{} in production mode: {}",
798                        name, e
799                    )));
800                }
801                warn!("Failed to mask /proc/{}: {} (continuing)", name, e);
802            }
803        }
804    }
805
806    for name in PROC_TMPFS_MASKED {
807        let target = proc_path.join(name);
808        if !target.exists() {
809            continue;
810        }
811        match mount(
812            Some("tmpfs"),
813            &target,
814            Some("tmpfs"),
815            MsFlags::MS_RDONLY | MsFlags::MS_NOSUID | MsFlags::MS_NODEV | MsFlags::MS_NOEXEC,
816            Some("size=0"),
817        ) {
818            Ok(_) => debug!("Masked /proc/{}", name),
819            Err(e) => {
820                if production {
821                    return Err(NucleusError::FilesystemError(format!(
822                        "Failed to mask /proc/{} in production mode: {}",
823                        name, e
824                    )));
825                }
826                warn!("Failed to mask /proc/{}: {} (continuing)", name, e);
827            }
828        }
829    }
830
831    info!("Finished masking sensitive /proc paths");
832    Ok(())
833}
834
835/// Switch to new root filesystem using pivot_root or chroot
836///
837/// This implements the transition: populated -> pivoted
838/// Fails closed if root switching cannot be established.
839pub fn switch_root(new_root: &Path, allow_chroot_fallback: bool) -> Result<()> {
840    info!("Switching root to {:?}", new_root);
841
842    match pivot_root_impl(new_root) {
843        Ok(()) => {
844            info!("Successfully switched root using pivot_root");
845            Ok(())
846        }
847        Err(e) => {
848            if allow_chroot_fallback {
849                warn!(
850                    "pivot_root failed ({}), falling back to chroot due to explicit \
851                     configuration",
852                    e
853                );
854                chroot_impl(new_root)
855            } else {
856                Err(NucleusError::PivotRootError(format!(
857                    "pivot_root failed: {}. chroot fallback is disabled by default; use \
858                     --allow-chroot-fallback to allow weaker isolation",
859                    e
860                )))
861            }
862        }
863    }
864}
865
866/// Implement root switch using pivot_root(2)
867///
868/// pivot_root is preferred over chroot because:
869/// - More secure (old root can be unmounted)
870/// - Works better with mount namespaces
871fn pivot_root_impl(new_root: &Path) -> Result<()> {
872    use nix::unistd::pivot_root;
873
874    // pivot_root requires new_root to be a mount point
875    // and old_root to be under new_root
876
877    let old_root = new_root.join(".old_root");
878    std::fs::create_dir_all(&old_root).map_err(|e| {
879        NucleusError::PivotRootError(format!("Failed to create old_root directory: {}", e))
880    })?;
881
882    // Perform pivot_root
883    pivot_root(new_root, &old_root)
884        .map_err(|e| NucleusError::PivotRootError(format!("pivot_root syscall failed: {}", e)))?;
885
886    // Change to new root
887    std::env::set_current_dir("/")
888        .map_err(|e| NucleusError::PivotRootError(format!("Failed to chdir to /: {}", e)))?;
889
890    // Unmount old root
891    nix::mount::umount2("/.old_root", nix::mount::MntFlags::MNT_DETACH)
892        .map_err(|e| NucleusError::PivotRootError(format!("Failed to unmount old root: {}", e)))?;
893
894    // Remove old root directory
895    let _ = std::fs::remove_dir("/.old_root");
896
897    Ok(())
898}
899
900/// Implement root switch using chroot(2)
901///
902/// chroot is less secure than pivot_root but works in more situations
903fn chroot_impl(new_root: &Path) -> Result<()> {
904    chroot(new_root)
905        .map_err(|e| NucleusError::PivotRootError(format!("chroot syscall failed: {}", e)))?;
906
907    // Change to new root
908    std::env::set_current_dir("/")
909        .map_err(|e| NucleusError::PivotRootError(format!("Failed to chdir to /: {}", e)))?;
910
911    // L3: Drop CAP_SYS_CHROOT after chroot to prevent escape via nested chroot.
912    // Also close any FDs pointing outside the new root.
913    if let Err(e) = caps::drop(
914        None,
915        caps::CapSet::Bounding,
916        caps::Capability::CAP_SYS_CHROOT,
917    ) {
918        debug!(
919            "Could not drop CAP_SYS_CHROOT after chroot: {} (may not be present)",
920            e
921        );
922    }
923    if let Err(e) = caps::drop(
924        None,
925        caps::CapSet::Effective,
926        caps::Capability::CAP_SYS_CHROOT,
927    ) {
928        debug!(
929            "Could not drop effective CAP_SYS_CHROOT: {} (may not be present)",
930            e
931        );
932    }
933    if let Err(e) = caps::drop(
934        None,
935        caps::CapSet::Permitted,
936        caps::Capability::CAP_SYS_CHROOT,
937    ) {
938        debug!(
939            "Could not drop permitted CAP_SYS_CHROOT: {} (may not be present)",
940            e
941        );
942    }
943
944    info!("Successfully switched root using chroot (CAP_SYS_CHROOT dropped)");
945
946    Ok(())
947}
948
949/// Mount secret files into the container root.
950///
951/// Each secret is bind-mounted read-only from its source to the destination
952/// path inside the container. Intermediate directories are created as needed.
953pub fn mount_secrets(root: &Path, secrets: &[crate::container::SecretMount]) -> Result<()> {
954    if secrets.is_empty() {
955        return Ok(());
956    }
957
958    info!("Mounting {} secret(s) into container", secrets.len());
959
960    for secret in secrets {
961        let source_fd = open(
962            &secret.source,
963            OFlag::O_PATH | OFlag::O_NOFOLLOW | OFlag::O_CLOEXEC,
964            Mode::empty(),
965        )
966        .map_err(|e| {
967            NucleusError::FilesystemError(format!(
968                "Failed to open secret source {:?} with O_NOFOLLOW: {}",
969                secret.source, e
970            ))
971        })?;
972        let source_stat = fstat(&source_fd).map_err(|e| {
973            NucleusError::FilesystemError(format!(
974                "Failed to stat secret source {:?}: {}",
975                secret.source, e
976            ))
977        })?;
978        let source_kind = SFlag::from_bits_truncate(source_stat.st_mode);
979        let source_is_file = source_kind == SFlag::S_IFREG;
980        let source_is_dir = source_kind == SFlag::S_IFDIR;
981        if !source_is_file && !source_is_dir {
982            return Err(NucleusError::FilesystemError(format!(
983                "Secret source {:?} must be a regular file or directory",
984                secret.source
985            )));
986        }
987        let source_fd_path = PathBuf::from(format!("/proc/self/fd/{}", source_fd.as_raw_fd()));
988
989        // Destination inside container root
990        let dest = resolve_container_destination(root, &secret.dest)?;
991
992        // Create parent directories
993        if let Some(parent) = dest.parent() {
994            std::fs::create_dir_all(parent).map_err(|e| {
995                NucleusError::FilesystemError(format!(
996                    "Failed to create secret mount parent {:?}: {}",
997                    parent, e
998                ))
999            })?;
1000        }
1001
1002        // Create mount point file
1003        if source_is_file {
1004            std::fs::write(&dest, "").map_err(|e| {
1005                NucleusError::FilesystemError(format!(
1006                    "Failed to create secret mount point {:?}: {}",
1007                    dest, e
1008                ))
1009            })?;
1010        } else {
1011            std::fs::create_dir_all(&dest).map_err(|e| {
1012                NucleusError::FilesystemError(format!(
1013                    "Failed to create secret mount dir {:?}: {}",
1014                    dest, e
1015                ))
1016            })?;
1017        }
1018
1019        // Bind mount read-only
1020        mount(
1021            Some(source_fd_path.as_path()),
1022            &dest,
1023            None::<&str>,
1024            MsFlags::MS_BIND,
1025            None::<&str>,
1026        )
1027        .map_err(|e| {
1028            NucleusError::FilesystemError(format!(
1029                "Failed to bind mount secret {:?}: {}",
1030                secret.source, e
1031            ))
1032        })?;
1033
1034        mount(
1035            None::<&str>,
1036            &dest,
1037            None::<&str>,
1038            MsFlags::MS_REMOUNT
1039                | MsFlags::MS_BIND
1040                | MsFlags::MS_RDONLY
1041                | MsFlags::MS_NOSUID
1042                | MsFlags::MS_NODEV
1043                | MsFlags::MS_NOEXEC,
1044            None::<&str>,
1045        )
1046        .map_err(|e| {
1047            NucleusError::FilesystemError(format!(
1048                "Failed to remount secret {:?} read-only: {}",
1049                dest, e
1050            ))
1051        })?;
1052
1053        // Apply configured file permissions on the mount point
1054        if source_is_file {
1055            use std::os::unix::fs::PermissionsExt;
1056            let perms = std::fs::Permissions::from_mode(secret.mode);
1057            if let Err(e) = std::fs::set_permissions(&dest, perms) {
1058                warn!(
1059                    "Failed to set mode {:04o} on secret {:?}: {} (bind mount may override)",
1060                    secret.mode, dest, e
1061                );
1062            }
1063        }
1064
1065        debug!(
1066            "Mounted secret {:?} -> {:?} (mode {:04o})",
1067            secret.source, secret.dest, secret.mode
1068        );
1069    }
1070
1071    Ok(())
1072}
1073
1074/// Mount secrets onto a dedicated in-memory tmpfs instead of bind-mounting host paths.
1075///
1076/// Creates a per-container tmpfs at `<root>/run/secrets` with MS_NOEXEC | MS_NOSUID | MS_NODEV,
1077/// copies secret contents into it, then zeros the read buffer. This ensures secrets
1078/// never reference host-side files after setup and are never persisted to disk.
1079pub fn mount_secrets_inmemory(
1080    root: &Path,
1081    secrets: &[crate::container::SecretMount],
1082    identity: &crate::container::ProcessIdentity,
1083) -> Result<()> {
1084    if secrets.is_empty() {
1085        return Ok(());
1086    }
1087
1088    info!("Mounting {} secret(s) on in-memory tmpfs", secrets.len());
1089
1090    let secrets_dir = root.join("run/secrets");
1091    std::fs::create_dir_all(&secrets_dir).map_err(|e| {
1092        NucleusError::FilesystemError(format!(
1093            "Failed to create secrets dir {:?}: {}",
1094            secrets_dir, e
1095        ))
1096    })?;
1097
1098    // Mount a size-limited tmpfs for secrets (16 MiB max)
1099    if let Err(e) = mount(
1100        Some("tmpfs"),
1101        &secrets_dir,
1102        Some("tmpfs"),
1103        MsFlags::MS_NOSUID | MsFlags::MS_NODEV | MsFlags::MS_NOEXEC,
1104        Some("size=16m,mode=0700"),
1105    ) {
1106        let _ = std::fs::remove_dir_all(&secrets_dir);
1107        return Err(NucleusError::FilesystemError(format!(
1108            "Failed to mount secrets tmpfs at {:?}: {}",
1109            secrets_dir, e
1110        )));
1111    }
1112
1113    if !identity.is_root() {
1114        nix::unistd::chown(
1115            &secrets_dir,
1116            Some(nix::unistd::Uid::from_raw(identity.uid)),
1117            Some(nix::unistd::Gid::from_raw(identity.gid)),
1118        )
1119        .map_err(|e| {
1120            let _ = nix::mount::umount2(&secrets_dir, nix::mount::MntFlags::MNT_DETACH);
1121            let _ = std::fs::remove_dir_all(&secrets_dir);
1122            NucleusError::FilesystemError(format!(
1123                "Failed to set /run/secrets owner to {}:{}: {}",
1124                identity.uid, identity.gid, e
1125            ))
1126        })?;
1127    }
1128
1129    // Rollback: unmount tmpfs and remove dir if any secret fails
1130    let result = mount_secrets_inmemory_inner(&secrets_dir, root, secrets, identity);
1131    if let Err(ref e) = result {
1132        let _ = nix::mount::umount2(&secrets_dir, nix::mount::MntFlags::MNT_DETACH);
1133        let _ = std::fs::remove_dir_all(&secrets_dir);
1134        return Err(NucleusError::FilesystemError(format!(
1135            "Secret mount failed (rolled back): {}",
1136            e
1137        )));
1138    }
1139
1140    info!("All secrets mounted on in-memory tmpfs");
1141    Ok(())
1142}
1143
1144fn mount_secrets_inmemory_inner(
1145    secrets_dir: &Path,
1146    root: &Path,
1147    secrets: &[crate::container::SecretMount],
1148    identity: &crate::container::ProcessIdentity,
1149) -> Result<()> {
1150    for secret in secrets {
1151        let mut content = read_regular_file_nofollow(&secret.source)?;
1152
1153        // Determine destination path inside the secrets tmpfs
1154        let dest = resolve_container_destination(secrets_dir, &secret.dest)?;
1155
1156        // Create parent directories within the tmpfs
1157        if let Some(parent) = dest.parent() {
1158            std::fs::create_dir_all(parent).map_err(|e| {
1159                NucleusError::FilesystemError(format!(
1160                    "Failed to create secret parent dir {:?}: {}",
1161                    parent, e
1162                ))
1163            })?;
1164        }
1165
1166        // Write secret content to tmpfs
1167        std::fs::write(&dest, &content).map_err(|e| {
1168            NucleusError::FilesystemError(format!("Failed to write secret to {:?}: {}", dest, e))
1169        })?;
1170
1171        // Set permissions
1172        {
1173            use std::os::unix::fs::PermissionsExt;
1174            let perms = std::fs::Permissions::from_mode(secret.mode);
1175            std::fs::set_permissions(&dest, perms).map_err(|e| {
1176                NucleusError::FilesystemError(format!(
1177                    "Failed to set permissions on secret {:?}: {}",
1178                    dest, e
1179                ))
1180            })?;
1181        }
1182
1183        if !identity.is_root() {
1184            nix::unistd::chown(
1185                &dest,
1186                Some(nix::unistd::Uid::from_raw(identity.uid)),
1187                Some(nix::unistd::Gid::from_raw(identity.gid)),
1188            )
1189            .map_err(|e| {
1190                NucleusError::FilesystemError(format!(
1191                    "Failed to set permissions owner on secret {:?} to {}:{}: {}",
1192                    dest, identity.uid, identity.gid, e
1193                ))
1194            })?;
1195        }
1196
1197        // Zero the in-memory buffer
1198        zeroize::Zeroize::zeroize(&mut content);
1199        drop(content);
1200
1201        // Also bind-mount the secret to its expected container path for compatibility
1202        let container_dest = resolve_container_destination(root, &secret.dest)?;
1203        if container_dest != dest {
1204            if let Some(parent) = container_dest.parent() {
1205                std::fs::create_dir_all(parent).map_err(|e| {
1206                    NucleusError::FilesystemError(format!(
1207                        "Failed to create secret mount parent {:?}: {}",
1208                        parent, e
1209                    ))
1210                })?;
1211            }
1212
1213            std::fs::write(&container_dest, "").map_err(|e| {
1214                NucleusError::FilesystemError(format!(
1215                    "Failed to create secret mount point {:?}: {}",
1216                    container_dest, e
1217                ))
1218            })?;
1219
1220            mount(
1221                Some(dest.as_path()),
1222                &container_dest,
1223                None::<&str>,
1224                MsFlags::MS_BIND,
1225                None::<&str>,
1226            )
1227            .map_err(|e| {
1228                NucleusError::FilesystemError(format!(
1229                    "Failed to bind mount secret {:?} -> {:?}: {}",
1230                    dest, container_dest, e
1231                ))
1232            })?;
1233
1234            mount(
1235                None::<&str>,
1236                &container_dest,
1237                None::<&str>,
1238                MsFlags::MS_REMOUNT
1239                    | MsFlags::MS_BIND
1240                    | MsFlags::MS_RDONLY
1241                    | MsFlags::MS_NOSUID
1242                    | MsFlags::MS_NODEV
1243                    | MsFlags::MS_NOEXEC,
1244                None::<&str>,
1245            )
1246            .map_err(|e| {
1247                NucleusError::FilesystemError(format!(
1248                    "Failed to remount secret {:?} read-only: {}",
1249                    container_dest, e
1250                ))
1251            })?;
1252        }
1253
1254        debug!(
1255            "Secret {:?} -> {:?} (in-memory tmpfs, mode {:04o})",
1256            secret.source, secret.dest, secret.mode
1257        );
1258    }
1259
1260    Ok(())
1261}
1262
#[cfg(test)]
mod tests {
    use super::*;
    use std::os::unix::fs::symlink;

    /// The sysrq trigger has to be part of the null-masked /proc entries.
    #[test]
    fn test_proc_mask_includes_sysrq_trigger() {
        let masked = PROC_NULL_MASKED.contains(&"sysrq-trigger");
        assert!(
            masked,
            "/proc/sysrq-trigger must be masked to prevent host DoS"
        );
    }

    /// timer_stats has to be part of the null-masked /proc entries.
    #[test]
    fn test_proc_mask_includes_timer_stats() {
        let masked = PROC_NULL_MASKED.contains(&"timer_stats");
        assert!(
            masked,
            "/proc/timer_stats must be masked to prevent kernel info leakage"
        );
    }

    /// All kpage* files have to be part of the null-masked /proc entries.
    #[test]
    fn test_proc_mask_includes_kpage_files() {
        ["kpagecount", "kpageflags", "kpagecgroup"]
            .iter()
            .for_each(|entry| {
                assert!(
                    PROC_NULL_MASKED.contains(entry),
                    "/proc/{} must be masked to prevent host memory layout leakage",
                    entry
                );
            });
    }

    /// The paths the OCI runtime spec asks to mask are covered by one of the
    /// two mask lists.
    #[test]
    fn test_proc_mask_includes_oci_standard_paths() {
        // OCI runtime spec required masked paths
        let null_masked = ["kallsyms", "kcore", "sched_debug", "keys", "config.gz"];
        null_masked.iter().for_each(|entry| {
            assert!(
                PROC_NULL_MASKED.contains(entry),
                "/proc/{} must be in null-masked list (OCI spec)",
                entry
            );
        });

        let tmpfs_masked = ["acpi", "bus", "scsi", "sys"];
        tmpfs_masked.iter().for_each(|entry| {
            assert!(
                PROC_TMPFS_MASKED.contains(entry),
                "/proc/{} must be in tmpfs-masked list (OCI spec)",
                entry
            );
        });
    }

    /// A plain regular file is read back verbatim.
    #[test]
    fn test_read_regular_file_nofollow_reads_regular_file() {
        let scratch = tempfile::tempdir().unwrap();
        let secret_path = scratch.path().join("secret.txt");
        std::fs::write(&secret_path, "supersecret").unwrap();

        let bytes = read_regular_file_nofollow(&secret_path).unwrap();
        assert_eq!(bytes, b"supersecret");
    }

    /// A symlink to a regular file is refused rather than followed.
    #[test]
    fn test_read_regular_file_nofollow_rejects_symlink() {
        let scratch = tempfile::tempdir().unwrap();
        let real_file = scratch.path().join("target.txt");
        let alias = scratch.path().join("secret-link");
        std::fs::write(&real_file, "supersecret").unwrap();
        symlink(&real_file, &alias).unwrap();

        let failure = read_regular_file_nofollow(&alias).unwrap_err();
        let message = failure.to_string();
        assert!(
            message.contains("O_NOFOLLOW"),
            "symlink reads must fail via O_NOFOLLOW"
        );
    }
}