Skip to main content

nucleus/filesystem/
mount.rs

1use crate::error::{NucleusError, Result};
2use nix::fcntl::{open, OFlag};
3use nix::mount::{mount, MsFlags};
4use nix::sys::stat::{fstat, makedev, mknod, Mode, SFlag};
5use nix::unistd::chroot;
6use std::fs::OpenOptions;
7use std::io::Read;
8use std::os::fd::AsRawFd;
9use std::os::unix::fs::OpenOptionsExt;
10use std::path::{Component, Path, PathBuf};
11use tracing::{debug, info, warn};
12
/// One entry of the mount audit policy: a mount point paired with the
/// per-mount options it is required to carry.
struct ExpectedMount {
    /// Absolute mount point, compared against the mount points parsed from
    /// `/proc/self/mountinfo`.
    path: &'static str,
    /// Mount options (for example `"ro"` or `"nosuid"`) that must all be set.
    required_flags: &'static [&'static str],
    /// When `true`, the mount is mandatory in production mode: its absence
    /// (e.g. a missing /proc) is reported as a violation instead of being
    /// silently skipped.
    critical: bool,
}
21
22/// Known mount paths and the flags they must carry in production mode.
23const PRODUCTION_MOUNT_EXPECTATIONS: &[ExpectedMount] = &[
24    ExpectedMount {
25        path: "/bin",
26        required_flags: &["ro", "nosuid", "nodev"],
27        critical: true,
28    },
29    ExpectedMount {
30        path: "/usr",
31        required_flags: &["ro", "nosuid", "nodev"],
32        critical: true,
33    },
34    ExpectedMount {
35        path: "/lib",
36        required_flags: &["ro", "nosuid", "nodev"],
37        critical: false, // not all rootfs layouts have /lib
38    },
39    ExpectedMount {
40        path: "/lib64",
41        required_flags: &["ro", "nosuid", "nodev"],
42        critical: false, // not all rootfs layouts have /lib64
43    },
44    ExpectedMount {
45        path: "/etc",
46        required_flags: &["ro", "nosuid", "nodev"],
47        critical: true,
48    },
49    ExpectedMount {
50        path: "/nix",
51        required_flags: &["ro", "nosuid", "nodev"],
52        critical: false, // only present on NixOS-based rootfs
53    },
54    ExpectedMount {
55        path: "/sbin",
56        required_flags: &["ro", "nosuid", "nodev"],
57        critical: false, // not all rootfs layouts have /sbin
58    },
59    ExpectedMount {
60        path: "/proc",
61        required_flags: &["nosuid", "nodev", "noexec"],
62        critical: true,
63    },
64    ExpectedMount {
65        path: "/run/secrets",
66        required_flags: &["nosuid", "nodev", "noexec"],
67        critical: false, // only present when secrets are configured
68    },
69];
70
71/// Normalize an absolute container destination path and reject traversal.
72///
73/// Returns a normalized absolute path containing only `RootDir` and `Normal`
74/// components. `.` segments are ignored; `..` and relative paths are rejected.
75pub fn normalize_container_destination(dest: &Path) -> Result<PathBuf> {
76    if !dest.is_absolute() {
77        return Err(NucleusError::ConfigError(format!(
78            "Container destination must be absolute: {:?}",
79            dest
80        )));
81    }
82
83    let mut normalized = PathBuf::from("/");
84    let mut saw_component = false;
85
86    for component in dest.components() {
87        match component {
88            Component::RootDir => {}
89            Component::CurDir => {}
90            Component::Normal(part) => {
91                normalized.push(part);
92                saw_component = true;
93            }
94            Component::ParentDir => {
95                return Err(NucleusError::ConfigError(format!(
96                    "Container destination must not contain parent traversal: {:?}",
97                    dest
98                )));
99            }
100            Component::Prefix(_) => {
101                return Err(NucleusError::ConfigError(format!(
102                    "Unsupported container destination prefix: {:?}",
103                    dest
104                )));
105            }
106        }
107    }
108
109    if !saw_component {
110        return Err(NucleusError::ConfigError(format!(
111            "Container destination must not be the root directory: {:?}",
112            dest
113        )));
114    }
115
116    Ok(normalized)
117}
118
119/// Resolve a validated container destination under a host-side root directory.
120pub fn resolve_container_destination(root: &Path, dest: &Path) -> Result<PathBuf> {
121    let normalized = normalize_container_destination(dest)?;
122    let relative = normalized.strip_prefix("/").map_err(|_| {
123        NucleusError::ConfigError(format!(
124            "Container destination is not absolute after normalization: {:?}",
125            normalized
126        ))
127    })?;
128    Ok(root.join(relative))
129}
130
131fn validate_rootfs_path_under_store(rootfs_path: &Path, store_root: &Path) -> Result<PathBuf> {
132    if !rootfs_path.is_absolute() {
133        return Err(NucleusError::ConfigError(format!(
134            "Production rootfs path must be absolute: {}",
135            rootfs_path.display()
136        )));
137    }
138
139    for component in rootfs_path.components() {
140        match component {
141            Component::ParentDir => {
142                return Err(NucleusError::ConfigError(format!(
143                    "Production rootfs path must not contain parent traversal: {}",
144                    rootfs_path.display()
145                )));
146            }
147            Component::Prefix(_) => {
148                return Err(NucleusError::ConfigError(format!(
149                    "Unsupported production rootfs path prefix: {}",
150                    rootfs_path.display()
151                )));
152            }
153            Component::RootDir | Component::CurDir | Component::Normal(_) => {}
154        }
155    }
156
157    let canonical = std::fs::canonicalize(rootfs_path).map_err(|e| {
158        NucleusError::ConfigError(format!(
159            "Failed to canonicalize production rootfs path '{}': {}",
160            rootfs_path.display(),
161            e
162        ))
163    })?;
164
165    if !canonical.starts_with(store_root) {
166        return Err(NucleusError::ConfigError(format!(
167            "Production mode requires rootfs path to resolve under {}: {} -> {}",
168            store_root.display(),
169            rootfs_path.display(),
170            canonical.display()
171        )));
172    }
173
174    if !canonical.is_dir() {
175        return Err(NucleusError::ConfigError(format!(
176            "Production rootfs path must resolve to a directory: {}",
177            canonical.display()
178        )));
179    }
180
181    Ok(canonical)
182}
183
184/// Validate a production rootfs path and return the canonical path to use.
185///
186/// Production rootfs paths must not contain parent traversal, and the resolved
187/// target must be a directory under the immutable Nix store.
188pub fn validate_production_rootfs_path(rootfs_path: &Path) -> Result<PathBuf> {
189    validate_rootfs_path_under_store(rootfs_path, Path::new("/nix/store"))
190}
191
192pub(crate) fn read_regular_file_nofollow(path: &Path) -> Result<Vec<u8>> {
193    let mut file = OpenOptions::new()
194        .read(true)
195        .custom_flags(libc::O_NOFOLLOW | libc::O_CLOEXEC)
196        .open(path)
197        .map_err(|e| {
198            NucleusError::FilesystemError(format!(
199                "Failed to open file {:?} with O_NOFOLLOW: {}",
200                path, e
201            ))
202        })?;
203
204    let metadata = file.metadata().map_err(|e| {
205        NucleusError::FilesystemError(format!("Failed to stat file {:?}: {}", path, e))
206    })?;
207    if !metadata.is_file() {
208        return Err(NucleusError::FilesystemError(format!(
209            "Expected regular file for {:?}, found non-file source",
210            path
211        )));
212    }
213
214    let mut content = Vec::new();
215    file.read_to_end(&mut content).map_err(|e| {
216        NucleusError::FilesystemError(format!("Failed to read file {:?}: {}", path, e))
217    })?;
218    Ok(content)
219}
220
/// Decode the octal escapes used in `/proc/<pid>/mountinfo` fields.
///
/// A backslash followed by exactly three octal digits that encode an ASCII
/// byte (for example `\040` for a space or `\134` for a backslash) is
/// replaced by that character. Any other backslash is kept literally and
/// nothing after it is consumed, so a malformed sequence can never swallow
/// the beginning of a following, valid escape.
fn decode_mountinfo_field(field: &str) -> String {
    let mut decoded = String::with_capacity(field.len());
    let mut rest = field;

    while let Some(pos) = rest.find('\\') {
        decoded.push_str(&rest[..pos]);
        let after = &rest[pos + 1..];

        // `get(..3)` is `None` when fewer than three bytes remain or when the
        // cut would land inside a multi-byte character.
        let escaped = after
            .get(..3)
            .filter(|code| code.bytes().all(|b| (b'0'..=b'7').contains(&b)))
            .and_then(|code| u8::from_str_radix(code, 8).ok())
            // Non-ASCII values would be fragments of a UTF-8 sequence, which
            // cannot be turned into a `char` one byte at a time.
            .filter(|byte| byte.is_ascii());

        match escaped {
            Some(byte) => {
                decoded.push(char::from(byte));
                rest = &after[3..];
            }
            None => {
                decoded.push('\\');
                rest = after;
            }
        }
    }

    decoded.push_str(rest);
    decoded
}
245
246fn parse_mountinfo_line(line: &str) -> Option<(String, std::collections::HashSet<String>)> {
247    let (left, _) = line.split_once(" - ")?;
248    let fields: Vec<&str> = left.split_whitespace().collect();
249    if fields.len() < 6 {
250        return None;
251    }
252
253    let mount_point = decode_mountinfo_field(fields[4]);
254    let options = fields[5]
255        .split(',')
256        .map(str::trim)
257        .filter(|opt| !opt.is_empty())
258        .map(str::to_string)
259        .collect();
260
261    Some((mount_point, options))
262}
263
264/// Audit all mounts in the container's mount namespace.
265///
266/// Reads `/proc/self/mountinfo` and verifies that each known mount point carries
267/// its expected per-mount flags. In production mode, any missing flag is fatal.
268/// Returns Ok(()) if all checks pass, or a list of violations.
269pub fn audit_mounts(production_mode: bool) -> Result<()> {
270    let mounts_content = std::fs::read_to_string("/proc/self/mountinfo").map_err(|e| {
271        NucleusError::FilesystemError(format!("Failed to read /proc/self/mountinfo: {}", e))
272    })?;
273    let mount_table: std::collections::HashMap<String, std::collections::HashSet<String>> =
274        mounts_content
275            .lines()
276            .filter_map(parse_mountinfo_line)
277            .collect();
278
279    let mut violations = Vec::new();
280
281    for expectation in PRODUCTION_MOUNT_EXPECTATIONS {
282        if let Some(options) = mount_table.get(expectation.path) {
283            for &flag in expectation.required_flags {
284                if !options.contains(flag) {
285                    let rendered = options
286                        .iter()
287                        .map(String::as_str)
288                        .collect::<Vec<_>>()
289                        .join(",");
290                    violations.push(format!(
291                        "Mount {} missing required flag '{}' (has: {})",
292                        expectation.path, flag, rendered
293                    ));
294                }
295            }
296        } else if expectation.critical && production_mode {
297            violations.push(format!(
298                "Critical mount {} is missing from the mount namespace",
299                expectation.path
300            ));
301        }
302    }
303
304    if violations.is_empty() {
305        info!("Mount audit passed: all expected flags verified");
306        Ok(())
307    } else if production_mode {
308        Err(NucleusError::FilesystemError(format!(
309            "Mount audit failed in production mode:\n  {}",
310            violations.join("\n  ")
311        )))
312    } else {
313        for v in &violations {
314            warn!("Mount audit: {}", v);
315        }
316        Ok(())
317    }
318}
319
320/// Create minimal filesystem structure in the new root
321pub fn create_minimal_fs(root: &Path) -> Result<()> {
322    info!("Creating minimal filesystem structure at {:?}", root);
323
324    // Create essential directories
325    let dirs = vec![
326        "dev",
327        "proc",
328        "sys",
329        "tmp",
330        "bin",
331        "sbin",
332        "usr",
333        "lib",
334        "lib64",
335        "etc",
336        "nix",
337        "nix/store",
338        "run",
339        "context",
340    ];
341
342    for dir in dirs {
343        let path = root.join(dir);
344        std::fs::create_dir_all(&path).map_err(|e| {
345            NucleusError::FilesystemError(format!("Failed to create directory {:?}: {}", path, e))
346        })?;
347    }
348
349    info!("Created minimal filesystem structure");
350
351    Ok(())
352}
353
354/// Create essential device nodes in /dev
355///
356/// In rootless mode, device node creation will fail gracefully
357pub fn create_dev_nodes(dev_path: &Path, include_tty: bool) -> Result<()> {
358    info!("Creating device nodes at {:?}", dev_path);
359
360    // Device nodes: (name, type, major, minor)
361    let mut devices = vec![
362        ("null", SFlag::S_IFCHR, 1, 3),
363        ("zero", SFlag::S_IFCHR, 1, 5),
364        ("full", SFlag::S_IFCHR, 1, 7),
365        ("random", SFlag::S_IFCHR, 1, 8),
366        ("urandom", SFlag::S_IFCHR, 1, 9),
367    ];
368    if include_tty {
369        devices.push(("tty", SFlag::S_IFCHR, 5, 0));
370    }
371
372    let mut created_count = 0;
373    let mut failed_count = 0;
374
375    for (name, dev_type, major, minor) in devices {
376        let path = dev_path.join(name);
377        let mode = Mode::from_bits_truncate(0o660);
378        let dev = makedev(major, minor);
379
380        match mknod(&path, dev_type, mode, dev) {
381            Ok(_) => {
382                info!("Created device node: {:?}", path);
383                created_count += 1;
384            }
385            Err(e) => {
386                // In rootless mode, mknod fails - this is expected
387                warn!(
388                    "Failed to create device node {:?}: {} (this is normal in rootless mode)",
389                    path, e
390                );
391                failed_count += 1;
392            }
393        }
394    }
395
396    if created_count > 0 {
397        info!("Successfully created {} device nodes", created_count);
398    }
399    if failed_count > 0 {
400        info!("Skipped {} device nodes (rootless mode)", failed_count);
401    }
402
403    Ok(())
404}
405
406/// Bind mount a pre-built rootfs (e.g. a Nix store closure) into the container.
407///
408/// Instead of exposing the full host /bin, /usr, /lib, /lib64, /nix, this mounts
409/// a minimal, purpose-built root filesystem. Suitable for production services.
410pub fn bind_mount_rootfs(root: &Path, rootfs_path: &Path) -> Result<()> {
411    info!(
412        "Bind mounting production rootfs {:?} into container {:?}",
413        rootfs_path, root
414    );
415
416    if std::fs::symlink_metadata(rootfs_path).is_err() {
417        return Err(NucleusError::FilesystemError(format!(
418            "Rootfs path does not exist: {:?}",
419            rootfs_path
420        )));
421    }
422
423    // Bind mount the rootfs contents into the container root.
424    // The rootfs is expected to contain a standard FHS layout (/bin, /lib, /etc, etc.)
425    // produced by a Nix buildEnv or similar.
426    let subdirs = ["bin", "sbin", "lib", "lib64", "usr", "etc", "nix"];
427
428    for subdir in &subdirs {
429        let source = rootfs_path.join(subdir);
430        if !source.exists() {
431            debug!("Rootfs subdir {} not present, skipping", subdir);
432            continue;
433        }
434
435        let target = root.join(subdir);
436        std::fs::create_dir_all(&target).map_err(|e| {
437            NucleusError::FilesystemError(format!(
438                "Failed to create mount point {:?}: {}",
439                target, e
440            ))
441        })?;
442
443        mount(
444            Some(&source),
445            &target,
446            None::<&str>,
447            MsFlags::MS_BIND | MsFlags::MS_REC,
448            None::<&str>,
449        )
450        .map_err(|e| {
451            NucleusError::FilesystemError(format!(
452                "Failed to bind mount rootfs {:?} -> {:?}: {}",
453                source, target, e
454            ))
455        })?;
456
457        // Remount read-only
458        mount(
459            None::<&str>,
460            &target,
461            None::<&str>,
462            MsFlags::MS_REMOUNT
463                | MsFlags::MS_BIND
464                | MsFlags::MS_RDONLY
465                | MsFlags::MS_REC
466                | MsFlags::MS_NOSUID
467                | MsFlags::MS_NODEV,
468            None::<&str>,
469        )
470        .map_err(|e| {
471            NucleusError::FilesystemError(format!(
472                "Failed to remount rootfs {:?} read-only: {}",
473                target, e
474            ))
475        })?;
476
477        info!("Mounted rootfs/{} read-only", subdir);
478    }
479
480    Ok(())
481}
482
483/// Bind mount essential host directories into container
484///
485/// This allows host binaries to be accessible inside the container.
486/// Used in agent mode. Production mode should use bind_mount_rootfs() instead.
487pub fn bind_mount_host_paths(root: &Path, best_effort: bool) -> Result<()> {
488    info!("Bind mounting host paths into container");
489
490    // Essential paths to bind mount (read-only)
491    let host_paths = vec![
492        "/bin", "/usr", "/lib", "/lib64", "/nix", // For NixOS
493    ];
494
495    for host_path in host_paths {
496        let host = Path::new(host_path);
497
498        // Only mount if the path exists on the host
499        if !host.exists() {
500            debug!("Skipping {} (not present on host)", host_path);
501            continue;
502        }
503
504        let container_path = root.join(host_path.trim_start_matches('/'));
505
506        // Create mount point
507        if let Err(e) = std::fs::create_dir_all(&container_path) {
508            if best_effort {
509                warn!("Failed to create mount point {:?}: {}", container_path, e);
510                continue;
511            }
512            return Err(NucleusError::FilesystemError(format!(
513                "Failed to create mount point {:?}: {}",
514                container_path, e
515            )));
516        }
517
518        // Attempt bind mount
519        // Note: Linux ignores MS_RDONLY on the initial bind mount call.
520        // A second remount is required to actually enforce read-only.
521        match mount(
522            Some(host),
523            &container_path,
524            None::<&str>,
525            MsFlags::MS_BIND | MsFlags::MS_REC,
526            None::<&str>,
527        ) {
528            Ok(_) => {
529                // Remount as read-only – required because MS_RDONLY is ignored on initial bind
530                mount(
531                    None::<&str>,
532                    &container_path,
533                    None::<&str>,
534                    MsFlags::MS_REMOUNT
535                        | MsFlags::MS_BIND
536                        | MsFlags::MS_RDONLY
537                        | MsFlags::MS_REC
538                        | MsFlags::MS_NOSUID
539                        | MsFlags::MS_NODEV,
540                    None::<&str>,
541                )
542                .map_err(|e| {
543                    NucleusError::FilesystemError(format!(
544                        "Failed to remount {} as read-only: {}",
545                        host_path, e
546                    ))
547                })?;
548                info!(
549                    "Bind mounted {} to {:?} (read-only)",
550                    host_path, container_path
551                );
552            }
553            Err(e) => {
554                if best_effort {
555                    warn!(
556                        "Failed to bind mount {}: {} (continuing anyway)",
557                        host_path, e
558                    );
559                } else {
560                    return Err(NucleusError::FilesystemError(format!(
561                        "Failed to bind mount {}: {}",
562                        host_path, e
563                    )));
564                }
565            }
566        }
567    }
568
569    Ok(())
570}
571
/// H7: Host paths that are refused as a bind-mount source only on an exact
/// match. Subdirectories of these paths are not covered by this list.
const DENIED_BIND_MOUNT_SOURCES_EXACT: &[&str] = &[
    "/", // the host root itself
];
574
/// Sensitive host subtrees: neither these directories nor anything beneath
/// them may be used as a bind-mount source.
const DENIED_BIND_MOUNT_SOURCE_PREFIXES: &[&str] = &[
    "/boot",
    "/dev",
    "/etc",
    "/home",
    "/proc",
    "/root",
    "/run",
    "/sys",
    "/var/log",
    "/var/run",
];
579
/// Container paths that user-supplied volumes may never be mounted on or
/// beneath.
///
/// These locations hold trusted runtime and rootfs state. A volume placed
/// over one of them would let a caller swap out attested container contents
/// or pseudo-filesystems after validation has already finished.
const RESERVED_VOLUME_DESTINATION_PREFIXES: &[&str] = &[
    "/bin",
    "/boot",
    "/dev",
    "/etc",
    "/lib",
    "/lib64",
    "/nix",
    "/proc",
    "/run/secrets",
    "/sbin",
    "/sys",
    "/usr",
];
599
600fn normalize_bind_mount_source_for_policy(source: &Path) -> Result<PathBuf> {
601    if !source.is_absolute() {
602        return Err(NucleusError::ConfigError(format!(
603            "Bind mount source must be absolute: {:?}",
604            source
605        )));
606    }
607
608    let mut normalized = PathBuf::from("/");
609
610    for component in source.components() {
611        match component {
612            Component::RootDir => {}
613            Component::CurDir => {}
614            Component::Normal(part) => normalized.push(part),
615            Component::ParentDir => {
616                normalized.pop();
617                if normalized.as_os_str().is_empty() {
618                    normalized.push("/");
619                }
620            }
621            Component::Prefix(_) => {
622                return Err(NucleusError::ConfigError(format!(
623                    "Unsupported bind mount source prefix: {:?}",
624                    source
625                )));
626            }
627        }
628    }
629
630    Ok(normalized)
631}
632
633fn reject_denied_bind_mount_source(source: &Path) -> Result<()> {
634    for denied in DENIED_BIND_MOUNT_SOURCES_EXACT {
635        if source == Path::new(denied) {
636            return Err(NucleusError::ConfigError(format!(
637                "Bind mount source '{}' is a sensitive host path and cannot be mounted into containers",
638                source.display()
639            )));
640        }
641    }
642
643    for denied in DENIED_BIND_MOUNT_SOURCE_PREFIXES {
644        let denied_path = Path::new(denied);
645        if source == denied_path || source.starts_with(denied_path) {
646            return Err(NucleusError::ConfigError(format!(
647                "Bind mount source '{}' is under sensitive host path '{}' and cannot be mounted into containers",
648                source.display(),
649                denied
650            )));
651        }
652    }
653
654    Ok(())
655}
656
657/// Validate bind-mount source policy without requiring the source to exist.
658///
659/// Topology persistent volumes use this before creating missing host paths, so
660/// sensitive host locations are rejected before any mkdir/chown side effects.
661pub fn validate_bind_mount_source_policy(source: &Path) -> Result<PathBuf> {
662    let normalized = normalize_bind_mount_source_for_policy(source)?;
663    reject_denied_bind_mount_source(&normalized)?;
664    Ok(normalized)
665}
666
667/// Validate that a bind mount source exists and is not a sensitive host path or subtree.
668pub fn validate_bind_mount_source(source: &Path) -> Result<()> {
669    validate_bind_mount_source_policy(source)?;
670
671    let canonical = std::fs::canonicalize(source).map_err(|e| {
672        NucleusError::ConfigError(format!(
673            "Failed to resolve bind mount source {:?}: {}",
674            source, e
675        ))
676    })?;
677    reject_denied_bind_mount_source(&canonical)
678}
679
680fn reject_reserved_volume_destination(dest: &Path) -> Result<()> {
681    for reserved in RESERVED_VOLUME_DESTINATION_PREFIXES {
682        let reserved_path = Path::new(reserved);
683        if dest == reserved_path || dest.starts_with(reserved_path) {
684            return Err(NucleusError::ConfigError(format!(
685                "Volume destination '{}' is reserved for trusted container/runtime paths and cannot be overlaid",
686                dest.display()
687            )));
688        }
689    }
690
691    Ok(())
692}
693
694/// Normalize and validate a user-supplied volume destination inside the container.
695pub fn normalize_volume_destination(dest: &Path) -> Result<PathBuf> {
696    let normalized = normalize_container_destination(dest)?;
697    reject_reserved_volume_destination(&normalized)?;
698    Ok(normalized)
699}
700
701/// Resolve a validated user-supplied volume destination under a host-side root directory.
702pub fn resolve_volume_destination(root: &Path, dest: &Path) -> Result<PathBuf> {
703    let normalized = normalize_volume_destination(dest)?;
704    let relative = normalized.strip_prefix("/").map_err(|_| {
705        NucleusError::ConfigError(format!(
706            "Volume destination is not absolute after normalization: {:?}",
707            normalized
708        ))
709    })?;
710    Ok(root.join(relative))
711}
712
713/// Mount persistent bind volumes and ephemeral tmpfs volumes into the container root.
714pub fn mount_volumes(root: &Path, volumes: &[crate::container::VolumeMount]) -> Result<()> {
715    use crate::container::VolumeSource;
716
717    if volumes.is_empty() {
718        return Ok(());
719    }
720
721    info!("Mounting {} volume(s) into container", volumes.len());
722
723    for volume in volumes {
724        let dest = resolve_volume_destination(root, &volume.dest)?;
725
726        match &volume.source {
727            VolumeSource::Bind { source } => {
728                // H7: Deny bind-mounting sensitive host paths
729                validate_bind_mount_source(source)?;
730
731                // Use symlink_metadata (lstat) instead of .exists() to avoid
732                // following symlinks in the existence check (O_NOFOLLOW semantics).
733                if std::fs::symlink_metadata(source).is_err() {
734                    return Err(NucleusError::FilesystemError(format!(
735                        "Volume source does not exist: {:?}",
736                        source
737                    )));
738                }
739
740                if let Some(parent) = dest.parent() {
741                    std::fs::create_dir_all(parent).map_err(|e| {
742                        NucleusError::FilesystemError(format!(
743                            "Failed to create volume mount parent {:?}: {}",
744                            parent, e
745                        ))
746                    })?;
747                }
748
749                let recursive = source.is_dir();
750                if source.is_file() {
751                    std::fs::write(&dest, "").map_err(|e| {
752                        NucleusError::FilesystemError(format!(
753                            "Failed to create volume mount point {:?}: {}",
754                            dest, e
755                        ))
756                    })?;
757                } else {
758                    std::fs::create_dir_all(&dest).map_err(|e| {
759                        NucleusError::FilesystemError(format!(
760                            "Failed to create volume mount dir {:?}: {}",
761                            dest, e
762                        ))
763                    })?;
764                }
765
766                let initial_flags = if recursive {
767                    MsFlags::MS_BIND | MsFlags::MS_REC
768                } else {
769                    MsFlags::MS_BIND
770                };
771                mount(
772                    Some(source.as_path()),
773                    &dest,
774                    None::<&str>,
775                    initial_flags,
776                    None::<&str>,
777                )
778                .map_err(|e| {
779                    NucleusError::FilesystemError(format!(
780                        "Failed to bind mount volume {:?} -> {:?}: {}",
781                        source, dest, e
782                    ))
783                })?;
784
785                let mut remount_flags =
786                    MsFlags::MS_REMOUNT | MsFlags::MS_BIND | MsFlags::MS_NOSUID | MsFlags::MS_NODEV;
787                if recursive {
788                    remount_flags |= MsFlags::MS_REC;
789                }
790                if volume.read_only {
791                    remount_flags |= MsFlags::MS_RDONLY;
792                }
793
794                mount(
795                    None::<&str>,
796                    &dest,
797                    None::<&str>,
798                    remount_flags,
799                    None::<&str>,
800                )
801                .map_err(|e| {
802                    NucleusError::FilesystemError(format!(
803                        "Failed to remount volume {:?} with final flags: {}",
804                        dest, e
805                    ))
806                })?;
807
808                info!(
809                    "Mounted bind volume {:?} -> {:?} ({})",
810                    source,
811                    volume.dest,
812                    if volume.read_only { "ro" } else { "rw" }
813                );
814            }
815            VolumeSource::Tmpfs { size } => {
816                std::fs::create_dir_all(&dest).map_err(|e| {
817                    NucleusError::FilesystemError(format!(
818                        "Failed to create tmpfs mount dir {:?}: {}",
819                        dest, e
820                    ))
821                })?;
822
823                // M8: Validate size parameter to prevent option injection.
824                // Only allow digits, optionally followed by K/M/G suffix.
825                if let Some(value) = size.as_ref() {
826                    let valid = value
827                        .chars()
828                        .all(|c| c.is_ascii_digit() || "kKmMgG".contains(c));
829                    if !valid || value.is_empty() {
830                        return Err(NucleusError::FilesystemError(format!(
831                            "Invalid tmpfs size value '{}': only digits with optional K/M/G suffix allowed",
832                            value
833                        )));
834                    }
835                }
836
837                // M7: Default to 64MB instead of half of physical RAM to
838                // prevent memory DoS from unbounded tmpfs volumes.
839                let mount_data = size
840                    .as_ref()
841                    .map(|value| format!("size={},mode=0700", value))
842                    .unwrap_or_else(|| "size=64M,mode=0700".to_string());
843
844                let mut flags = MsFlags::MS_NOSUID | MsFlags::MS_NODEV;
845                if volume.read_only {
846                    flags |= MsFlags::MS_RDONLY;
847                }
848                mount(
849                    Some("tmpfs"),
850                    &dest,
851                    Some("tmpfs"),
852                    flags,
853                    Some(mount_data.as_str()),
854                )
855                .map_err(|e| {
856                    NucleusError::FilesystemError(format!(
857                        "Failed to mount tmpfs volume at {:?}: {}",
858                        dest, e
859                    ))
860                })?;
861
862                info!(
863                    "Mounted tmpfs volume at {:?}{}{}",
864                    volume.dest,
865                    size.as_ref()
866                        .map(|value| format!(" (size={})", value))
867                        .unwrap_or_default(),
868                    if volume.read_only { " (ro)" } else { "" }
869                );
870            }
871        }
872    }
873
874    Ok(())
875}
876
877/// Mount procfs at the given path
878///
879/// In rootless mode, procfs mounting should work due to user namespace capabilities.
880/// When `hide_pids` is true, mounts with hidepid=2 so processes cannot enumerate
881/// other PIDs (production hardening). The best-effort fallback only applies to
882/// non-hardened procfs mounts; requested hidepid hardening is fail-closed.
883pub fn mount_procfs(
884    proc_path: &Path,
885    best_effort: bool,
886    read_only: bool,
887    hide_pids: bool,
888) -> Result<()> {
889    info!(
890        "Mounting procfs at {:?} (hidepid={})",
891        proc_path,
892        if hide_pids { "2" } else { "0" }
893    );
894
895    let mount_data: Option<&str> = if hide_pids { Some("hidepid=2") } else { None };
896
897    let mounted = match mount(
898        Some("proc"),
899        proc_path,
900        Some("proc"),
901        MsFlags::MS_NOSUID | MsFlags::MS_NODEV | MsFlags::MS_NOEXEC,
902        mount_data,
903    ) {
904        Ok(_) => true,
905        Err(e) => handle_procfs_mount_failure(e, best_effort, hide_pids)?,
906    };
907
908    if mounted {
909        if read_only {
910            mount(
911                None::<&str>,
912                proc_path,
913                None::<&str>,
914                MsFlags::MS_REMOUNT
915                    | MsFlags::MS_RDONLY
916                    | MsFlags::MS_NOSUID
917                    | MsFlags::MS_NODEV
918                    | MsFlags::MS_NOEXEC,
919                mount_data,
920            )
921            .map_err(|e| {
922                NucleusError::FilesystemError(format!("Failed to remount procfs read-only: {}", e))
923            })?;
924            info!("Successfully mounted procfs (read-only)");
925        } else {
926            info!("Successfully mounted procfs");
927        }
928    }
929
930    Ok(())
931}
932
933fn handle_procfs_mount_failure(
934    e: nix::errno::Errno,
935    best_effort: bool,
936    hide_pids: bool,
937) -> Result<bool> {
938    if hide_pids {
939        return Err(NucleusError::FilesystemError(format!(
940            "Failed to mount procfs with required hidepid=2: {}",
941            e
942        )));
943    }
944
945    if best_effort {
946        warn!("Failed to mount procfs: {} (continuing anyway)", e);
947        Ok(false)
948    } else {
949        Err(NucleusError::FilesystemError(format!(
950            "Failed to mount procfs: {}",
951            e
952        )))
953    }
954}
955
/// File entries under /proc that are masked by bind-mounting /dev/null over
/// them (intended to match the OCI runtime spec masked paths).
///
/// Each entry is a name relative to the procfs mount point; `mask_proc_paths`
/// joins it onto the procfs path and skips entries that do not exist.
/// Public so tests can assert on the list's contents.
pub const PROC_NULL_MASKED: &[&str] = &[
    "kallsyms",
    "kcore",
    "sched_debug",
    "timer_list",
    "timer_stats",
    "keys",
    "latency_stats",
    "config.gz",
    "sysrq-trigger",
    "kpagecount",
    "kpageflags",
    "kpagecgroup",
];
973
/// Entries under /proc that stay visible but are remounted read-only
/// (intended to match the OCI runtime spec readonlyPaths). Names are relative
/// to the procfs mount point.
pub const PROC_READONLY_PATHS: &[&str] = &["bus", "fs", "irq", "sys"];
976
/// Directory entries under /proc that are hidden by mounting an empty tmpfs
/// over them. Names are relative to the procfs mount point.
pub const PROC_TMPFS_MASKED: &[&str] = &["acpi", "scsi"];
979
980fn remount_proc_path_readonly(target: &Path) -> Result<()> {
981    mount(
982        Some(target),
983        target,
984        None::<&str>,
985        MsFlags::MS_BIND | MsFlags::MS_REC,
986        None::<&str>,
987    )
988    .map_err(|e| {
989        NucleusError::FilesystemError(format!(
990            "Failed to bind-mount {:?} onto itself for read-only remount: {}",
991            target, e
992        ))
993    })?;
994
995    mount(
996        None::<&str>,
997        target,
998        None::<&str>,
999        MsFlags::MS_REMOUNT
1000            | MsFlags::MS_BIND
1001            | MsFlags::MS_RDONLY
1002            | MsFlags::MS_NOSUID
1003            | MsFlags::MS_NODEV
1004            | MsFlags::MS_NOEXEC,
1005        None::<&str>,
1006    )
1007    .map_err(|e| {
1008        NucleusError::FilesystemError(format!("Failed to remount {:?} read-only: {}", target, e))
1009    })?;
1010
1011    Ok(())
1012}
1013
1014/// Mask sensitive /proc paths by bind-mounting /dev/null or tmpfs over them
1015///
1016/// This reduces kernel information leakage from the container. Follows OCI runtime
1017/// conventions for masked paths.
1018///
1019/// SEC-06: When `production` is true, failures to mask critical paths
1020/// (kcore, kallsyms, sysrq-trigger) are fatal instead of warn-and-continue.
1021pub fn mask_proc_paths(proc_path: &Path, production: bool) -> Result<()> {
1022    info!("Masking sensitive /proc paths");
1023
1024    const CRITICAL_PROC_PATHS: &[&str] = &["kcore", "kallsyms", "sysrq-trigger"];
1025
1026    for name in PROC_READONLY_PATHS {
1027        let target = proc_path.join(name);
1028        if !target.exists() {
1029            continue;
1030        }
1031        match remount_proc_path_readonly(&target) {
1032            Ok(_) => debug!("Remounted /proc/{} read-only", name),
1033            Err(e) => {
1034                if production {
1035                    return Err(NucleusError::FilesystemError(format!(
1036                        "Failed to remount /proc/{} read-only in production mode: {}",
1037                        name, e
1038                    )));
1039                }
1040                warn!(
1041                    "Failed to remount /proc/{} read-only: {} (continuing)",
1042                    name, e
1043                );
1044            }
1045        }
1046    }
1047
1048    let dev_null = Path::new("/dev/null");
1049
1050    for name in PROC_NULL_MASKED {
1051        let target = proc_path.join(name);
1052        if !target.exists() {
1053            continue;
1054        }
1055        match mount(
1056            Some(dev_null),
1057            &target,
1058            None::<&str>,
1059            MsFlags::MS_BIND,
1060            None::<&str>,
1061        ) {
1062            Ok(_) => {
1063                // Remount read-only: Linux ignores MS_RDONLY on the initial bind mount,
1064                // so a separate MS_REMOUNT|MS_BIND|MS_RDONLY call is required.
1065                if let Err(e) = mount(
1066                    None::<&str>,
1067                    &target,
1068                    None::<&str>,
1069                    MsFlags::MS_REMOUNT | MsFlags::MS_BIND | MsFlags::MS_RDONLY,
1070                    None::<&str>,
1071                ) {
1072                    if production && CRITICAL_PROC_PATHS.contains(name) {
1073                        return Err(NucleusError::FilesystemError(format!(
1074                            "Failed to remount /proc/{} read-only in production mode: {}",
1075                            name, e
1076                        )));
1077                    }
1078                    warn!(
1079                        "Failed to remount /proc/{} read-only: {} (continuing)",
1080                        name, e
1081                    );
1082                }
1083                debug!("Masked /proc/{} (read-only)", name);
1084            }
1085            Err(e) => {
1086                if production && CRITICAL_PROC_PATHS.contains(name) {
1087                    return Err(NucleusError::FilesystemError(format!(
1088                        "Failed to mask critical /proc/{} in production mode: {}",
1089                        name, e
1090                    )));
1091                }
1092                warn!("Failed to mask /proc/{}: {} (continuing)", name, e);
1093            }
1094        }
1095    }
1096
1097    for name in PROC_TMPFS_MASKED {
1098        let target = proc_path.join(name);
1099        if !target.exists() {
1100            continue;
1101        }
1102        match mount(
1103            Some("tmpfs"),
1104            &target,
1105            Some("tmpfs"),
1106            MsFlags::MS_RDONLY | MsFlags::MS_NOSUID | MsFlags::MS_NODEV | MsFlags::MS_NOEXEC,
1107            Some("size=0"),
1108        ) {
1109            Ok(_) => debug!("Masked /proc/{}", name),
1110            Err(e) => {
1111                if production {
1112                    return Err(NucleusError::FilesystemError(format!(
1113                        "Failed to mask /proc/{} in production mode: {}",
1114                        name, e
1115                    )));
1116                }
1117                warn!("Failed to mask /proc/{}: {} (continuing)", name, e);
1118            }
1119        }
1120    }
1121
1122    info!("Finished masking sensitive /proc paths");
1123    Ok(())
1124}
1125
1126/// Switch to new root filesystem using pivot_root or chroot
1127///
1128/// This implements the transition: populated -> pivoted
1129/// Fails closed if root switching cannot be established.
1130pub fn switch_root(new_root: &Path, allow_chroot_fallback: bool) -> Result<()> {
1131    info!("Switching root to {:?}", new_root);
1132
1133    match pivot_root_impl(new_root) {
1134        Ok(()) => {
1135            info!("Successfully switched root using pivot_root");
1136            Ok(())
1137        }
1138        Err(e) => {
1139            if allow_chroot_fallback {
1140                warn!(
1141                    "pivot_root failed ({}), falling back to chroot due to explicit \
1142                     configuration",
1143                    e
1144                );
1145                chroot_impl(new_root)
1146            } else {
1147                Err(NucleusError::PivotRootError(format!(
1148                    "pivot_root failed: {}. chroot fallback is disabled by default; use \
1149                     --allow-chroot-fallback to allow weaker isolation",
1150                    e
1151                )))
1152            }
1153        }
1154    }
1155}
1156
1157/// Implement root switch using pivot_root(2)
1158///
1159/// pivot_root is preferred over chroot because:
1160/// - More secure (old root can be unmounted)
1161/// - Works better with mount namespaces
1162fn pivot_root_impl(new_root: &Path) -> Result<()> {
1163    use nix::unistd::pivot_root;
1164
1165    // pivot_root requires new_root to be a mount point
1166    // and old_root to be under new_root
1167
1168    let old_root = new_root.join(".old_root");
1169    std::fs::create_dir_all(&old_root).map_err(|e| {
1170        NucleusError::PivotRootError(format!("Failed to create old_root directory: {}", e))
1171    })?;
1172
1173    // Perform pivot_root
1174    pivot_root(new_root, &old_root)
1175        .map_err(|e| NucleusError::PivotRootError(format!("pivot_root syscall failed: {}", e)))?;
1176
1177    // Change to new root
1178    std::env::set_current_dir("/")
1179        .map_err(|e| NucleusError::PivotRootError(format!("Failed to chdir to /: {}", e)))?;
1180
1181    // Unmount old root
1182    nix::mount::umount2("/.old_root", nix::mount::MntFlags::MNT_DETACH)
1183        .map_err(|e| NucleusError::PivotRootError(format!("Failed to unmount old root: {}", e)))?;
1184
1185    // Remove old root directory
1186    let _ = std::fs::remove_dir("/.old_root");
1187
1188    Ok(())
1189}
1190
1191/// Implement root switch using chroot(2)
1192///
1193/// chroot is less secure than pivot_root but works in more situations
1194fn chroot_impl(new_root: &Path) -> Result<()> {
1195    fn close_non_stdio_fds_after_chroot() -> Result<()> {
1196        // Any pre-chroot fd can still reach outside the jail, so close every
1197        // non-stdio descriptor before continuing setup inside the fallback root.
1198        let ret = unsafe { libc::syscall(libc::SYS_close_range, 3u32, u32::MAX, 0u32) };
1199        if ret == 0 {
1200            return Ok(());
1201        }
1202
1203        let max_fd = match unsafe { libc::sysconf(libc::_SC_OPEN_MAX) } {
1204            n if n > 3 && n <= i32::MAX as libc::c_long => n as i32,
1205            _ => 1024,
1206        };
1207
1208        for fd in 3..max_fd {
1209            if unsafe { libc::close(fd) } != 0 {
1210                let err = std::io::Error::last_os_error();
1211                if err.raw_os_error() != Some(libc::EBADF) {
1212                    return Err(NucleusError::PivotRootError(format!(
1213                        "Failed to close inherited fd {} after chroot: {}",
1214                        fd, err
1215                    )));
1216                }
1217            }
1218        }
1219
1220        Ok(())
1221    }
1222
1223    chroot(new_root)
1224        .map_err(|e| NucleusError::PivotRootError(format!("chroot syscall failed: {}", e)))?;
1225
1226    // Change to new root
1227    std::env::set_current_dir("/")
1228        .map_err(|e| NucleusError::PivotRootError(format!("Failed to chdir to /: {}", e)))?;
1229
1230    close_non_stdio_fds_after_chroot()?;
1231
1232    // L3: Drop CAP_SYS_CHROOT after chroot to prevent escape via nested chroot.
1233    if let Err(e) = caps::drop(
1234        None,
1235        caps::CapSet::Bounding,
1236        caps::Capability::CAP_SYS_CHROOT,
1237    ) {
1238        debug!(
1239            "Could not drop CAP_SYS_CHROOT after chroot: {} (may not be present)",
1240            e
1241        );
1242    }
1243    if let Err(e) = caps::drop(
1244        None,
1245        caps::CapSet::Effective,
1246        caps::Capability::CAP_SYS_CHROOT,
1247    ) {
1248        debug!(
1249            "Could not drop effective CAP_SYS_CHROOT: {} (may not be present)",
1250            e
1251        );
1252    }
1253    if let Err(e) = caps::drop(
1254        None,
1255        caps::CapSet::Permitted,
1256        caps::Capability::CAP_SYS_CHROOT,
1257    ) {
1258        debug!(
1259            "Could not drop permitted CAP_SYS_CHROOT: {} (may not be present)",
1260            e
1261        );
1262    }
1263
1264    info!("Successfully switched root using chroot (CAP_SYS_CHROOT dropped)");
1265
1266    Ok(())
1267}
1268
1269/// Mount secret files into the container root.
1270///
1271/// Each secret is bind-mounted read-only from its source to the destination
1272/// path inside the container. Intermediate directories are created as needed.
1273pub fn mount_secrets(root: &Path, secrets: &[crate::container::SecretMount]) -> Result<()> {
1274    if secrets.is_empty() {
1275        return Ok(());
1276    }
1277
1278    info!("Mounting {} secret(s) into container", secrets.len());
1279
1280    for secret in secrets {
1281        let source_fd = open(
1282            &secret.source,
1283            OFlag::O_PATH | OFlag::O_NOFOLLOW | OFlag::O_CLOEXEC,
1284            Mode::empty(),
1285        )
1286        .map_err(|e| {
1287            NucleusError::FilesystemError(format!(
1288                "Failed to open secret source {:?} with O_NOFOLLOW: {}",
1289                secret.source, e
1290            ))
1291        })?;
1292        let source_stat = fstat(&source_fd).map_err(|e| {
1293            NucleusError::FilesystemError(format!(
1294                "Failed to stat secret source {:?}: {}",
1295                secret.source, e
1296            ))
1297        })?;
1298        let source_kind = SFlag::from_bits_truncate(source_stat.st_mode);
1299        let source_is_file = source_kind == SFlag::S_IFREG;
1300        let source_is_dir = source_kind == SFlag::S_IFDIR;
1301        if !source_is_file && !source_is_dir {
1302            return Err(NucleusError::FilesystemError(format!(
1303                "Secret source {:?} must be a regular file or directory",
1304                secret.source
1305            )));
1306        }
1307        let source_fd_path = PathBuf::from(format!("/proc/self/fd/{}", source_fd.as_raw_fd()));
1308
1309        // Destination inside container root
1310        let dest = resolve_container_destination(root, &secret.dest)?;
1311
1312        // Create parent directories
1313        if let Some(parent) = dest.parent() {
1314            std::fs::create_dir_all(parent).map_err(|e| {
1315                NucleusError::FilesystemError(format!(
1316                    "Failed to create secret mount parent {:?}: {}",
1317                    parent, e
1318                ))
1319            })?;
1320        }
1321
1322        // Create mount point file
1323        if source_is_file {
1324            std::fs::write(&dest, "").map_err(|e| {
1325                NucleusError::FilesystemError(format!(
1326                    "Failed to create secret mount point {:?}: {}",
1327                    dest, e
1328                ))
1329            })?;
1330        } else {
1331            std::fs::create_dir_all(&dest).map_err(|e| {
1332                NucleusError::FilesystemError(format!(
1333                    "Failed to create secret mount dir {:?}: {}",
1334                    dest, e
1335                ))
1336            })?;
1337        }
1338
1339        // Bind mount read-only
1340        mount(
1341            Some(source_fd_path.as_path()),
1342            &dest,
1343            None::<&str>,
1344            MsFlags::MS_BIND,
1345            None::<&str>,
1346        )
1347        .map_err(|e| {
1348            NucleusError::FilesystemError(format!(
1349                "Failed to bind mount secret {:?}: {}",
1350                secret.source, e
1351            ))
1352        })?;
1353
1354        mount(
1355            None::<&str>,
1356            &dest,
1357            None::<&str>,
1358            MsFlags::MS_REMOUNT
1359                | MsFlags::MS_BIND
1360                | MsFlags::MS_RDONLY
1361                | MsFlags::MS_NOSUID
1362                | MsFlags::MS_NODEV
1363                | MsFlags::MS_NOEXEC,
1364            None::<&str>,
1365        )
1366        .map_err(|e| {
1367            NucleusError::FilesystemError(format!(
1368                "Failed to remount secret {:?} read-only: {}",
1369                dest, e
1370            ))
1371        })?;
1372
1373        // Apply configured file permissions on the mount point
1374        if source_is_file {
1375            use std::os::unix::fs::PermissionsExt;
1376            let perms = std::fs::Permissions::from_mode(secret.mode);
1377            if let Err(e) = std::fs::set_permissions(&dest, perms) {
1378                warn!(
1379                    "Failed to set mode {:04o} on secret {:?}: {} (bind mount may override)",
1380                    secret.mode, dest, e
1381                );
1382            }
1383        }
1384
1385        debug!(
1386            "Mounted secret {:?} -> {:?} (mode {:04o})",
1387            secret.source, secret.dest, secret.mode
1388        );
1389    }
1390
1391    Ok(())
1392}
1393
1394/// Mount secrets onto a dedicated in-memory tmpfs instead of bind-mounting host paths.
1395///
1396/// Creates a per-container tmpfs at `<root>/run/secrets` with MS_NOEXEC | MS_NOSUID | MS_NODEV,
1397/// copies secret contents into it, then zeros the read buffer. This ensures secrets
1398/// never reference host-side files after setup and are never persisted to disk.
1399pub fn mount_secrets_inmemory(
1400    root: &Path,
1401    secrets: &[crate::container::SecretMount],
1402    identity: &crate::container::ProcessIdentity,
1403) -> Result<()> {
1404    if secrets.is_empty() {
1405        return Ok(());
1406    }
1407
1408    info!("Mounting {} secret(s) on in-memory tmpfs", secrets.len());
1409
1410    let secrets_dir = root.join("run/secrets");
1411    std::fs::create_dir_all(&secrets_dir).map_err(|e| {
1412        NucleusError::FilesystemError(format!(
1413            "Failed to create secrets dir {:?}: {}",
1414            secrets_dir, e
1415        ))
1416    })?;
1417
1418    // Mount a size-limited tmpfs for secrets (16 MiB max)
1419    if let Err(e) = mount(
1420        Some("tmpfs"),
1421        &secrets_dir,
1422        Some("tmpfs"),
1423        MsFlags::MS_NOSUID | MsFlags::MS_NODEV | MsFlags::MS_NOEXEC,
1424        Some("size=16m,mode=0700"),
1425    ) {
1426        let _ = std::fs::remove_dir_all(&secrets_dir);
1427        return Err(NucleusError::FilesystemError(format!(
1428            "Failed to mount secrets tmpfs at {:?}: {}",
1429            secrets_dir, e
1430        )));
1431    }
1432
1433    if !identity.is_root() {
1434        nix::unistd::chown(
1435            &secrets_dir,
1436            Some(nix::unistd::Uid::from_raw(identity.uid)),
1437            Some(nix::unistd::Gid::from_raw(identity.gid)),
1438        )
1439        .map_err(|e| {
1440            let _ = nix::mount::umount2(&secrets_dir, nix::mount::MntFlags::MNT_DETACH);
1441            let _ = std::fs::remove_dir_all(&secrets_dir);
1442            NucleusError::FilesystemError(format!(
1443                "Failed to set /run/secrets owner to {}:{}: {}",
1444                identity.uid, identity.gid, e
1445            ))
1446        })?;
1447    }
1448
1449    // Rollback: unmount tmpfs and remove dir if any secret fails
1450    let result = mount_secrets_inmemory_inner(&secrets_dir, root, secrets, identity);
1451    if let Err(ref e) = result {
1452        let _ = nix::mount::umount2(&secrets_dir, nix::mount::MntFlags::MNT_DETACH);
1453        let _ = std::fs::remove_dir_all(&secrets_dir);
1454        return Err(NucleusError::FilesystemError(format!(
1455            "Secret mount failed (rolled back): {}",
1456            e
1457        )));
1458    }
1459
1460    info!("All secrets mounted on in-memory tmpfs");
1461    Ok(())
1462}
1463
1464fn mount_secrets_inmemory_inner(
1465    secrets_dir: &Path,
1466    root: &Path,
1467    secrets: &[crate::container::SecretMount],
1468    identity: &crate::container::ProcessIdentity,
1469) -> Result<()> {
1470    for secret in secrets {
1471        let mut content = read_regular_file_nofollow(&secret.source)?;
1472
1473        // Determine destination path inside the secrets tmpfs
1474        let dest = resolve_container_destination(secrets_dir, &secret.dest)?;
1475
1476        // Create parent directories within the tmpfs
1477        if let Some(parent) = dest.parent() {
1478            std::fs::create_dir_all(parent).map_err(|e| {
1479                NucleusError::FilesystemError(format!(
1480                    "Failed to create secret parent dir {:?}: {}",
1481                    parent, e
1482                ))
1483            })?;
1484        }
1485
1486        // Write secret content to tmpfs
1487        std::fs::write(&dest, &content).map_err(|e| {
1488            NucleusError::FilesystemError(format!("Failed to write secret to {:?}: {}", dest, e))
1489        })?;
1490
1491        // Set permissions
1492        {
1493            use std::os::unix::fs::PermissionsExt;
1494            let perms = std::fs::Permissions::from_mode(secret.mode);
1495            std::fs::set_permissions(&dest, perms).map_err(|e| {
1496                NucleusError::FilesystemError(format!(
1497                    "Failed to set permissions on secret {:?}: {}",
1498                    dest, e
1499                ))
1500            })?;
1501        }
1502
1503        if !identity.is_root() {
1504            nix::unistd::chown(
1505                &dest,
1506                Some(nix::unistd::Uid::from_raw(identity.uid)),
1507                Some(nix::unistd::Gid::from_raw(identity.gid)),
1508            )
1509            .map_err(|e| {
1510                NucleusError::FilesystemError(format!(
1511                    "Failed to set permissions owner on secret {:?} to {}:{}: {}",
1512                    dest, identity.uid, identity.gid, e
1513                ))
1514            })?;
1515        }
1516
1517        // Zero the in-memory buffer
1518        zeroize::Zeroize::zeroize(&mut content);
1519        drop(content);
1520
1521        // Also bind-mount the secret to its expected container path for compatibility
1522        let container_dest = resolve_container_destination(root, &secret.dest)?;
1523        if container_dest != dest {
1524            if let Some(parent) = container_dest.parent() {
1525                std::fs::create_dir_all(parent).map_err(|e| {
1526                    NucleusError::FilesystemError(format!(
1527                        "Failed to create secret mount parent {:?}: {}",
1528                        parent, e
1529                    ))
1530                })?;
1531            }
1532
1533            std::fs::write(&container_dest, "").map_err(|e| {
1534                NucleusError::FilesystemError(format!(
1535                    "Failed to create secret mount point {:?}: {}",
1536                    container_dest, e
1537                ))
1538            })?;
1539
1540            mount(
1541                Some(dest.as_path()),
1542                &container_dest,
1543                None::<&str>,
1544                MsFlags::MS_BIND,
1545                None::<&str>,
1546            )
1547            .map_err(|e| {
1548                NucleusError::FilesystemError(format!(
1549                    "Failed to bind mount secret {:?} -> {:?}: {}",
1550                    dest, container_dest, e
1551                ))
1552            })?;
1553
1554            mount(
1555                None::<&str>,
1556                &container_dest,
1557                None::<&str>,
1558                MsFlags::MS_REMOUNT
1559                    | MsFlags::MS_BIND
1560                    | MsFlags::MS_RDONLY
1561                    | MsFlags::MS_NOSUID
1562                    | MsFlags::MS_NODEV
1563                    | MsFlags::MS_NOEXEC,
1564                None::<&str>,
1565            )
1566            .map_err(|e| {
1567                NucleusError::FilesystemError(format!(
1568                    "Failed to remount secret {:?} read-only: {}",
1569                    container_dest, e
1570                ))
1571            })?;
1572        }
1573
1574        debug!(
1575            "Secret {:?} -> {:?} (in-memory tmpfs, mode {:04o})",
1576            secret.source, secret.dest, secret.mode
1577        );
1578    }
1579
1580    Ok(())
1581}
1582
1583#[cfg(test)]
1584mod tests {
1585    use super::*;
1586    use std::os::unix::fs::symlink;
1587
1588    #[test]
1589    fn test_validate_bind_mount_source_rejects_sensitive_subtrees() {
1590        for path in [
1591            "/",
1592            "/boot",
1593            "/dev/kmsg",
1594            "/etc",
1595            "/etc/passwd",
1596            "/home/alice/.ssh",
1597            "/proc/sys",
1598            "/root/.ssh",
1599            "/run/secrets",
1600            "/sys/fs/cgroup",
1601            "/var/log",
1602        ] {
1603            let err = validate_bind_mount_source(Path::new(path)).unwrap_err();
1604            assert!(
1605                err.to_string().contains("sensitive host path"),
1606                "expected sensitive-path rejection for {path}, got: {err}"
1607            );
1608        }
1609    }
1610
1611    #[test]
1612    fn test_validate_bind_mount_source_allows_regular_host_paths() {
1613        let temp = tempfile::TempDir::new().unwrap();
1614        let safe_path = temp.path().join("data");
1615        std::fs::create_dir(&safe_path).unwrap();
1616
1617        validate_bind_mount_source(&safe_path).unwrap();
1618    }
1619
1620    #[test]
1621    fn test_validate_bind_mount_source_normalizes_parent_components_before_filtering() {
1622        let temp = tempfile::TempDir::new().unwrap();
1623        let safe_path = temp.path().join("data");
1624        std::fs::create_dir(&safe_path).unwrap();
1625
1626        validate_bind_mount_source(&safe_path.join("../data")).unwrap();
1627    }
1628
1629    #[test]
1630    fn test_bind_mount_source_policy_rejects_sensitive_paths_before_creation() {
1631        let err = validate_bind_mount_source_policy(Path::new("/tmp/../../etc/nucleus-volume"))
1632            .unwrap_err();
1633        assert!(
1634            err.to_string().contains("sensitive host path"),
1635            "expected sensitive-path rejection before path creation, got: {err}"
1636        );
1637    }
1638
1639    #[test]
1640    fn test_volume_destinations_reject_reserved_container_paths() {
1641        for path in [
1642            "/bin/tool",
1643            "/dev/null",
1644            "/etc/app",
1645            "/lib64/ld-linux-x86-64.so.2",
1646            "/nix/store/data",
1647            "/proc/sys",
1648            "/run/secrets/token",
1649            "/usr/local/bin",
1650        ] {
1651            let err = normalize_volume_destination(Path::new(path)).unwrap_err();
1652            assert!(
1653                err.to_string().contains("reserved"),
1654                "expected reserved destination rejection for {path}, got: {err}"
1655            );
1656        }
1657    }
1658
1659    #[test]
1660    fn test_volume_destinations_allow_data_paths() {
1661        assert_eq!(
1662            normalize_volume_destination(Path::new("/var/lib/app")).unwrap(),
1663            PathBuf::from("/var/lib/app")
1664        );
1665        assert_eq!(
1666            normalize_volume_destination(Path::new("/opt/app/data")).unwrap(),
1667            PathBuf::from("/opt/app/data")
1668        );
1669    }
1670
1671    #[test]
1672    fn test_production_rootfs_path_rejects_parent_traversal() {
1673        let temp = tempfile::TempDir::new().unwrap();
1674        let store = temp.path().join("store");
1675        std::fs::create_dir(&store).unwrap();
1676
1677        let err =
1678            validate_rootfs_path_under_store(&store.join("../outside-rootfs"), &store).unwrap_err();
1679
1680        assert!(
1681            err.to_string().contains("parent traversal"),
1682            "expected parent traversal rejection, got: {err}"
1683        );
1684    }
1685
1686    #[test]
1687    fn test_production_rootfs_path_rejects_symlink_escape() {
1688        let temp = tempfile::TempDir::new().unwrap();
1689        let store = temp.path().join("store");
1690        let outside = temp.path().join("outside-rootfs");
1691        std::fs::create_dir(&store).unwrap();
1692        std::fs::create_dir(&outside).unwrap();
1693        symlink(&outside, store.join("rootfs-link")).unwrap();
1694
1695        let err = validate_rootfs_path_under_store(&store.join("rootfs-link"), &store).unwrap_err();
1696
1697        assert!(
1698            err.to_string().contains("resolve under"),
1699            "expected symlink escape rejection, got: {err}"
1700        );
1701    }
1702
1703    #[test]
1704    fn test_production_rootfs_path_returns_canonical_store_target() {
1705        let temp = tempfile::TempDir::new().unwrap();
1706        let store = temp.path().join("store");
1707        let rootfs = store.join("abcd-rootfs");
1708        std::fs::create_dir(&store).unwrap();
1709        std::fs::create_dir(&rootfs).unwrap();
1710        symlink(&rootfs, store.join("rootfs-link")).unwrap();
1711
1712        let canonical =
1713            validate_rootfs_path_under_store(&store.join("rootfs-link"), &store).unwrap();
1714
1715        assert_eq!(canonical, std::fs::canonicalize(rootfs).unwrap());
1716    }
1717
1718    #[test]
1719    fn test_proc_mask_includes_sysrq_trigger() {
1720        assert!(
1721            PROC_NULL_MASKED.contains(&"sysrq-trigger"),
1722            "/proc/sysrq-trigger must be masked to prevent host DoS"
1723        );
1724    }
1725
1726    #[test]
1727    fn test_proc_mask_includes_timer_stats() {
1728        assert!(
1729            PROC_NULL_MASKED.contains(&"timer_stats"),
1730            "/proc/timer_stats must be masked to prevent kernel info leakage"
1731        );
1732    }
1733
1734    #[test]
1735    fn test_proc_mask_includes_kpage_files() {
1736        for path in &["kpagecount", "kpageflags", "kpagecgroup"] {
1737            assert!(
1738                PROC_NULL_MASKED.contains(path),
1739                "/proc/{} must be masked to prevent host memory layout leakage",
1740                path
1741            );
1742        }
1743    }
1744
1745    #[test]
1746    fn test_proc_mask_includes_oci_standard_paths() {
1747        // OCI runtime spec required masked paths
1748        for path in &["kallsyms", "kcore", "sched_debug", "keys", "config.gz"] {
1749            assert!(
1750                PROC_NULL_MASKED.contains(path),
1751                "/proc/{} must be in null-masked list (OCI spec)",
1752                path
1753            );
1754        }
1755        for path in &["acpi", "scsi"] {
1756            assert!(
1757                PROC_TMPFS_MASKED.contains(path),
1758                "/proc/{} must be in tmpfs-masked list (OCI spec)",
1759                path
1760            );
1761        }
1762        for path in &["bus", "fs", "irq", "sys"] {
1763            assert!(
1764                PROC_READONLY_PATHS.contains(path),
1765                "/proc/{} must be in read-only remount list (OCI spec)",
1766                path
1767            );
1768            assert!(
1769                !PROC_TMPFS_MASKED.contains(path),
1770                "/proc/{} must stay visible read-only, not hidden behind tmpfs",
1771                path
1772            );
1773        }
1774    }
1775
1776    #[test]
1777    fn test_procfs_hidepid_failure_fails_closed_even_best_effort() {
1778        let err = handle_procfs_mount_failure(nix::errno::Errno::EINVAL, true, true).unwrap_err();
1779
1780        assert!(
1781            err.to_string().contains("required hidepid=2"),
1782            "hidepid=2 failures must remain fatal in production/rootless paths, got: {err}"
1783        );
1784    }
1785
1786    #[test]
1787    fn test_procfs_best_effort_only_applies_without_hidepid() {
1788        assert!(
1789            !handle_procfs_mount_failure(nix::errno::Errno::EPERM, true, false).unwrap(),
1790            "best-effort procfs mount failures may only continue when hidepid was not requested"
1791        );
1792    }
1793
1794    #[test]
1795    fn test_parse_mountinfo_line_uses_mountinfo_mount_point_and_flags() {
1796        let line =
1797            "36 25 0:32 / /run/secrets rw,nosuid,nodev,noexec,relatime - tmpfs tmpfs rw,size=1024k";
1798        let (mount_point, flags) = parse_mountinfo_line(line).unwrap();
1799
1800        assert_eq!(mount_point, "/run/secrets");
1801        assert!(flags.contains("nosuid"));
1802        assert!(flags.contains("nodev"));
1803        assert!(flags.contains("noexec"));
1804    }
1805
1806    #[test]
1807    fn test_parse_mountinfo_line_decodes_escaped_mount_points() {
1808        let line = "41 25 0:40 / /path\\040with\\040spaces ro,nosuid,nodev - ext4 /dev/root ro";
1809        let (mount_point, flags) = parse_mountinfo_line(line).unwrap();
1810
1811        assert_eq!(mount_point, "/path with spaces");
1812        assert!(flags.contains("ro"));
1813    }
1814
1815    #[test]
1816    fn test_chroot_impl_closes_non_stdio_fds() {
1817        let source = include_str!("mount.rs");
1818        let fn_start = source.find("fn chroot_impl").unwrap();
1819        let after = &source[fn_start..];
1820        let open = after.find('{').unwrap();
1821        let mut depth = 0u32;
1822        let mut fn_end = open;
1823        for (i, ch) in after[open..].char_indices() {
1824            match ch {
1825                '{' => depth += 1,
1826                '}' => {
1827                    depth -= 1;
1828                    if depth == 0 {
1829                        fn_end = open + i + 1;
1830                        break;
1831                    }
1832                }
1833                _ => {}
1834            }
1835        }
1836        let body = &after[..fn_end];
1837        assert!(
1838            body.contains("close_non_stdio_fds_after_chroot()?"),
1839            "chroot fallback must close inherited non-stdio fds before continuing setup"
1840        );
1841    }
1842
1843    #[test]
1844    fn test_read_regular_file_nofollow_reads_regular_file() {
1845        let temp = tempfile::tempdir().unwrap();
1846        let path = temp.path().join("secret.txt");
1847        std::fs::write(&path, "supersecret").unwrap();
1848
1849        let content = read_regular_file_nofollow(&path).unwrap();
1850        assert_eq!(content, b"supersecret");
1851    }
1852
1853    #[test]
1854    fn test_read_regular_file_nofollow_rejects_symlink() {
1855        let temp = tempfile::tempdir().unwrap();
1856        let target = temp.path().join("target.txt");
1857        let link = temp.path().join("secret-link");
1858        std::fs::write(&target, "supersecret").unwrap();
1859        symlink(&target, &link).unwrap();
1860
1861        let err = read_regular_file_nofollow(&link).unwrap_err();
1862        assert!(
1863            err.to_string().contains("O_NOFOLLOW"),
1864            "symlink reads must fail via O_NOFOLLOW"
1865        );
1866    }
1867}