1use crate::error::{NucleusError, Result};
2use nix::fcntl::{open, OFlag};
3use nix::mount::{mount, MsFlags};
4use nix::sys::stat::{fstat, makedev, mknod, Mode, SFlag};
5use nix::unistd::chroot;
6use std::fs::OpenOptions;
7use std::io::Read;
8use std::os::fd::AsRawFd;
9use std::os::unix::fs::OpenOptionsExt;
10use std::path::{Component, Path, PathBuf};
11use tracing::{debug, info, warn};
12
/// One row of the mount audit table: a mount point plus the options it must carry.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct ExpectedMount {
    /// Absolute mount point, compared against the mount-point column of the mount table.
    path: &'static str,
    /// Mount options that must all be present on the mount.
    required_flags: &'static [&'static str],
    /// When `true`, the absence of this mount is itself a violation in production mode.
    critical: bool,
}
21
22const PRODUCTION_MOUNT_EXPECTATIONS: &[ExpectedMount] = &[
24 ExpectedMount {
25 path: "/bin",
26 required_flags: &["ro", "nosuid", "nodev"],
27 critical: true,
28 },
29 ExpectedMount {
30 path: "/usr",
31 required_flags: &["ro", "nosuid", "nodev"],
32 critical: true,
33 },
34 ExpectedMount {
35 path: "/lib",
36 required_flags: &["ro", "nosuid", "nodev"],
37 critical: false, },
39 ExpectedMount {
40 path: "/lib64",
41 required_flags: &["ro", "nosuid", "nodev"],
42 critical: false, },
44 ExpectedMount {
45 path: "/etc",
46 required_flags: &["ro", "nosuid", "nodev"],
47 critical: true,
48 },
49 ExpectedMount {
50 path: "/nix",
51 required_flags: &["ro", "nosuid", "nodev"],
52 critical: false, },
54 ExpectedMount {
55 path: "/sbin",
56 required_flags: &["ro", "nosuid", "nodev"],
57 critical: false, },
59 ExpectedMount {
60 path: "/proc",
61 required_flags: &["nosuid", "nodev", "noexec"],
62 critical: true,
63 },
64 ExpectedMount {
65 path: "/run/secrets",
66 required_flags: &["nosuid", "nodev", "noexec"],
67 critical: false, },
69];
70
71pub fn normalize_container_destination(dest: &Path) -> Result<PathBuf> {
76 if !dest.is_absolute() {
77 return Err(NucleusError::ConfigError(format!(
78 "Container destination must be absolute: {:?}",
79 dest
80 )));
81 }
82
83 let mut normalized = PathBuf::from("/");
84 let mut saw_component = false;
85
86 for component in dest.components() {
87 match component {
88 Component::RootDir => {}
89 Component::CurDir => {}
90 Component::Normal(part) => {
91 normalized.push(part);
92 saw_component = true;
93 }
94 Component::ParentDir => {
95 return Err(NucleusError::ConfigError(format!(
96 "Container destination must not contain parent traversal: {:?}",
97 dest
98 )));
99 }
100 Component::Prefix(_) => {
101 return Err(NucleusError::ConfigError(format!(
102 "Unsupported container destination prefix: {:?}",
103 dest
104 )));
105 }
106 }
107 }
108
109 if !saw_component {
110 return Err(NucleusError::ConfigError(format!(
111 "Container destination must not be the root directory: {:?}",
112 dest
113 )));
114 }
115
116 Ok(normalized)
117}
118
119pub fn resolve_container_destination(root: &Path, dest: &Path) -> Result<PathBuf> {
121 let normalized = normalize_container_destination(dest)?;
122 let relative = normalized.strip_prefix("/").map_err(|_| {
123 NucleusError::ConfigError(format!(
124 "Container destination is not absolute after normalization: {:?}",
125 normalized
126 ))
127 })?;
128 Ok(root.join(relative))
129}
130
131pub(crate) fn read_regular_file_nofollow(path: &Path) -> Result<Vec<u8>> {
132 let mut file = OpenOptions::new()
133 .read(true)
134 .custom_flags(libc::O_NOFOLLOW | libc::O_CLOEXEC)
135 .open(path)
136 .map_err(|e| {
137 NucleusError::FilesystemError(format!(
138 "Failed to open file {:?} with O_NOFOLLOW: {}",
139 path, e
140 ))
141 })?;
142
143 let metadata = file.metadata().map_err(|e| {
144 NucleusError::FilesystemError(format!("Failed to stat file {:?}: {}", path, e))
145 })?;
146 if !metadata.is_file() {
147 return Err(NucleusError::FilesystemError(format!(
148 "Expected regular file for {:?}, found non-file source",
149 path
150 )));
151 }
152
153 let mut content = Vec::new();
154 file.read_to_end(&mut content).map_err(|e| {
155 NucleusError::FilesystemError(format!("Failed to read file {:?}: {}", path, e))
156 })?;
157 Ok(content)
158}
159
160pub fn audit_mounts(production_mode: bool) -> Result<()> {
166 let mounts_content = std::fs::read_to_string("/proc/self/mounts").map_err(|e| {
167 NucleusError::FilesystemError(format!("Failed to read /proc/self/mounts: {}", e))
168 })?;
169
170 let mut violations = Vec::new();
171
172 for expectation in PRODUCTION_MOUNT_EXPECTATIONS {
173 let mount_entry = mounts_content.lines().find(|line| {
175 let parts: Vec<&str> = line.split_whitespace().collect();
176 parts.len() >= 4 && parts[1] == expectation.path
177 });
178
179 if let Some(entry) = mount_entry {
180 let parts: Vec<&str> = entry.split_whitespace().collect();
181 if parts.len() >= 4 {
182 let options = parts[3];
183 for &flag in expectation.required_flags {
184 if !options.split(',').any(|opt| opt == flag) {
185 violations.push(format!(
186 "Mount {} missing required flag '{}' (has: {})",
187 expectation.path, flag, options
188 ));
189 }
190 }
191 }
192 } else if expectation.critical && production_mode {
193 violations.push(format!(
194 "Critical mount {} is missing from the mount namespace",
195 expectation.path
196 ));
197 }
198 }
199
200 if violations.is_empty() {
201 info!("Mount audit passed: all expected flags verified");
202 Ok(())
203 } else if production_mode {
204 Err(NucleusError::FilesystemError(format!(
205 "Mount audit failed in production mode:\n {}",
206 violations.join("\n ")
207 )))
208 } else {
209 for v in &violations {
210 warn!("Mount audit: {}", v);
211 }
212 Ok(())
213 }
214}
215
216pub fn create_minimal_fs(root: &Path) -> Result<()> {
218 info!("Creating minimal filesystem structure at {:?}", root);
219
220 let dirs = vec![
222 "dev",
223 "proc",
224 "sys",
225 "tmp",
226 "bin",
227 "sbin",
228 "usr",
229 "lib",
230 "lib64",
231 "etc",
232 "nix",
233 "nix/store",
234 "run",
235 "context",
236 ];
237
238 for dir in dirs {
239 let path = root.join(dir);
240 std::fs::create_dir_all(&path).map_err(|e| {
241 NucleusError::FilesystemError(format!("Failed to create directory {:?}: {}", path, e))
242 })?;
243 }
244
245 info!("Created minimal filesystem structure");
246
247 Ok(())
248}
249
250pub fn create_dev_nodes(dev_path: &Path, include_tty: bool) -> Result<()> {
254 info!("Creating device nodes at {:?}", dev_path);
255
256 let mut devices = vec![
258 ("null", SFlag::S_IFCHR, 1, 3),
259 ("zero", SFlag::S_IFCHR, 1, 5),
260 ("full", SFlag::S_IFCHR, 1, 7),
261 ("random", SFlag::S_IFCHR, 1, 8),
262 ("urandom", SFlag::S_IFCHR, 1, 9),
263 ];
264 if include_tty {
265 devices.push(("tty", SFlag::S_IFCHR, 5, 0));
266 }
267
268 let mut created_count = 0;
269 let mut failed_count = 0;
270
271 for (name, dev_type, major, minor) in devices {
272 let path = dev_path.join(name);
273 let mode = Mode::from_bits_truncate(0o660);
274 let dev = makedev(major, minor);
275
276 match mknod(&path, dev_type, mode, dev) {
277 Ok(_) => {
278 info!("Created device node: {:?}", path);
279 created_count += 1;
280 }
281 Err(e) => {
282 warn!(
284 "Failed to create device node {:?}: {} (this is normal in rootless mode)",
285 path, e
286 );
287 failed_count += 1;
288 }
289 }
290 }
291
292 if created_count > 0 {
293 info!("Successfully created {} device nodes", created_count);
294 }
295 if failed_count > 0 {
296 info!("Skipped {} device nodes (rootless mode)", failed_count);
297 }
298
299 Ok(())
300}
301
302pub fn bind_mount_rootfs(root: &Path, rootfs_path: &Path) -> Result<()> {
307 info!(
308 "Bind mounting production rootfs {:?} into container {:?}",
309 rootfs_path, root
310 );
311
312 if std::fs::symlink_metadata(rootfs_path).is_err() {
313 return Err(NucleusError::FilesystemError(format!(
314 "Rootfs path does not exist: {:?}",
315 rootfs_path
316 )));
317 }
318
319 let subdirs = ["bin", "sbin", "lib", "lib64", "usr", "etc", "nix"];
323
324 for subdir in &subdirs {
325 let source = rootfs_path.join(subdir);
326 if !source.exists() {
327 debug!("Rootfs subdir {} not present, skipping", subdir);
328 continue;
329 }
330
331 let target = root.join(subdir);
332 std::fs::create_dir_all(&target).map_err(|e| {
333 NucleusError::FilesystemError(format!(
334 "Failed to create mount point {:?}: {}",
335 target, e
336 ))
337 })?;
338
339 mount(
340 Some(&source),
341 &target,
342 None::<&str>,
343 MsFlags::MS_BIND | MsFlags::MS_REC,
344 None::<&str>,
345 )
346 .map_err(|e| {
347 NucleusError::FilesystemError(format!(
348 "Failed to bind mount rootfs {:?} -> {:?}: {}",
349 source, target, e
350 ))
351 })?;
352
353 mount(
355 None::<&str>,
356 &target,
357 None::<&str>,
358 MsFlags::MS_REMOUNT
359 | MsFlags::MS_BIND
360 | MsFlags::MS_RDONLY
361 | MsFlags::MS_REC
362 | MsFlags::MS_NOSUID
363 | MsFlags::MS_NODEV,
364 None::<&str>,
365 )
366 .map_err(|e| {
367 NucleusError::FilesystemError(format!(
368 "Failed to remount rootfs {:?} read-only: {}",
369 target, e
370 ))
371 })?;
372
373 info!("Mounted rootfs/{} read-only", subdir);
374 }
375
376 Ok(())
377}
378
379pub fn bind_mount_host_paths(root: &Path, best_effort: bool) -> Result<()> {
384 info!("Bind mounting host paths into container");
385
386 let host_paths = vec![
388 "/bin", "/usr", "/lib", "/lib64", "/nix", ];
390
391 for host_path in host_paths {
392 let host = Path::new(host_path);
393
394 if !host.exists() {
396 debug!("Skipping {} (not present on host)", host_path);
397 continue;
398 }
399
400 let container_path = root.join(host_path.trim_start_matches('/'));
401
402 if let Err(e) = std::fs::create_dir_all(&container_path) {
404 if best_effort {
405 warn!("Failed to create mount point {:?}: {}", container_path, e);
406 continue;
407 }
408 return Err(NucleusError::FilesystemError(format!(
409 "Failed to create mount point {:?}: {}",
410 container_path, e
411 )));
412 }
413
414 match mount(
418 Some(host),
419 &container_path,
420 None::<&str>,
421 MsFlags::MS_BIND | MsFlags::MS_REC,
422 None::<&str>,
423 ) {
424 Ok(_) => {
425 mount(
427 None::<&str>,
428 &container_path,
429 None::<&str>,
430 MsFlags::MS_REMOUNT
431 | MsFlags::MS_BIND
432 | MsFlags::MS_RDONLY
433 | MsFlags::MS_REC
434 | MsFlags::MS_NOSUID
435 | MsFlags::MS_NODEV,
436 None::<&str>,
437 )
438 .map_err(|e| {
439 NucleusError::FilesystemError(format!(
440 "Failed to remount {} as read-only: {}",
441 host_path, e
442 ))
443 })?;
444 info!(
445 "Bind mounted {} to {:?} (read-only)",
446 host_path, container_path
447 );
448 }
449 Err(e) => {
450 if best_effort {
451 warn!(
452 "Failed to bind mount {}: {} (continuing anyway)",
453 host_path, e
454 );
455 } else {
456 return Err(NucleusError::FilesystemError(format!(
457 "Failed to bind mount {}: {}",
458 host_path, e
459 )));
460 }
461 }
462 }
463 }
464
465 Ok(())
466}
467
/// Host paths that may never be used as a bind mount source.
///
/// Entries are matched by exact path equality only; paths beneath them are not
/// covered by this list.
const DENIED_BIND_MOUNT_SOURCES_EXACT: &[&str] = &[
    "/",
    "/etc/shadow",
    "/etc/sudoers",
    "/etc/passwd",
    "/etc/gshadow",
];
476
/// Host directory trees that may never be used as a bind mount source.
///
/// A source is rejected when it equals one of these paths or lies anywhere below it.
const DENIED_BIND_MOUNT_SOURCE_PREFIXES: &[&str] = &[
    "/proc",
    "/sys",
    "/dev",
    "/boot",
];
479
480fn normalize_bind_mount_source_for_policy(source: &Path) -> Result<PathBuf> {
481 if !source.is_absolute() {
482 return Err(NucleusError::ConfigError(format!(
483 "Bind mount source must be absolute: {:?}",
484 source
485 )));
486 }
487
488 let mut normalized = PathBuf::from("/");
489
490 for component in source.components() {
491 match component {
492 Component::RootDir => {}
493 Component::CurDir => {}
494 Component::Normal(part) => normalized.push(part),
495 Component::ParentDir => {
496 normalized.pop();
497 if normalized.as_os_str().is_empty() {
498 normalized.push("/");
499 }
500 }
501 Component::Prefix(_) => {
502 return Err(NucleusError::ConfigError(format!(
503 "Unsupported bind mount source prefix: {:?}",
504 source
505 )));
506 }
507 }
508 }
509
510 Ok(normalized)
511}
512
513fn reject_denied_bind_mount_source(source: &Path) -> Result<()> {
514 for denied in DENIED_BIND_MOUNT_SOURCES_EXACT {
515 if source == Path::new(denied) {
516 return Err(NucleusError::ConfigError(format!(
517 "Bind mount source '{}' is a sensitive host path and cannot be mounted into containers",
518 source.display()
519 )));
520 }
521 }
522
523 for denied in DENIED_BIND_MOUNT_SOURCE_PREFIXES {
524 let denied_path = Path::new(denied);
525 if source == denied_path || source.starts_with(denied_path) {
526 return Err(NucleusError::ConfigError(format!(
527 "Bind mount source '{}' is under sensitive host path '{}' and cannot be mounted into containers",
528 source.display(),
529 denied
530 )));
531 }
532 }
533
534 Ok(())
535}
536
537pub fn validate_bind_mount_source(source: &Path) -> Result<()> {
539 let normalized = normalize_bind_mount_source_for_policy(source)?;
540 reject_denied_bind_mount_source(&normalized)?;
541
542 let canonical = std::fs::canonicalize(source).map_err(|e| {
543 NucleusError::ConfigError(format!(
544 "Failed to resolve bind mount source {:?}: {}",
545 source, e
546 ))
547 })?;
548 reject_denied_bind_mount_source(&canonical)
549}
550
551pub fn mount_volumes(root: &Path, volumes: &[crate::container::VolumeMount]) -> Result<()> {
553 use crate::container::VolumeSource;
554
555 if volumes.is_empty() {
556 return Ok(());
557 }
558
559 info!("Mounting {} volume(s) into container", volumes.len());
560
561 for volume in volumes {
562 let dest = resolve_container_destination(root, &volume.dest)?;
563
564 match &volume.source {
565 VolumeSource::Bind { source } => {
566 validate_bind_mount_source(source)?;
568
569 if std::fs::symlink_metadata(source).is_err() {
572 return Err(NucleusError::FilesystemError(format!(
573 "Volume source does not exist: {:?}",
574 source
575 )));
576 }
577
578 if let Some(parent) = dest.parent() {
579 std::fs::create_dir_all(parent).map_err(|e| {
580 NucleusError::FilesystemError(format!(
581 "Failed to create volume mount parent {:?}: {}",
582 parent, e
583 ))
584 })?;
585 }
586
587 let recursive = source.is_dir();
588 if source.is_file() {
589 std::fs::write(&dest, "").map_err(|e| {
590 NucleusError::FilesystemError(format!(
591 "Failed to create volume mount point {:?}: {}",
592 dest, e
593 ))
594 })?;
595 } else {
596 std::fs::create_dir_all(&dest).map_err(|e| {
597 NucleusError::FilesystemError(format!(
598 "Failed to create volume mount dir {:?}: {}",
599 dest, e
600 ))
601 })?;
602 }
603
604 let initial_flags = if recursive {
605 MsFlags::MS_BIND | MsFlags::MS_REC
606 } else {
607 MsFlags::MS_BIND
608 };
609 mount(
610 Some(source.as_path()),
611 &dest,
612 None::<&str>,
613 initial_flags,
614 None::<&str>,
615 )
616 .map_err(|e| {
617 NucleusError::FilesystemError(format!(
618 "Failed to bind mount volume {:?} -> {:?}: {}",
619 source, dest, e
620 ))
621 })?;
622
623 let mut remount_flags =
624 MsFlags::MS_REMOUNT | MsFlags::MS_BIND | MsFlags::MS_NOSUID | MsFlags::MS_NODEV;
625 if recursive {
626 remount_flags |= MsFlags::MS_REC;
627 }
628 if volume.read_only {
629 remount_flags |= MsFlags::MS_RDONLY;
630 }
631
632 mount(
633 None::<&str>,
634 &dest,
635 None::<&str>,
636 remount_flags,
637 None::<&str>,
638 )
639 .map_err(|e| {
640 NucleusError::FilesystemError(format!(
641 "Failed to remount volume {:?} with final flags: {}",
642 dest, e
643 ))
644 })?;
645
646 info!(
647 "Mounted bind volume {:?} -> {:?} ({})",
648 source,
649 volume.dest,
650 if volume.read_only { "ro" } else { "rw" }
651 );
652 }
653 VolumeSource::Tmpfs { size } => {
654 std::fs::create_dir_all(&dest).map_err(|e| {
655 NucleusError::FilesystemError(format!(
656 "Failed to create tmpfs mount dir {:?}: {}",
657 dest, e
658 ))
659 })?;
660
661 if let Some(value) = size.as_ref() {
664 let valid = value
665 .chars()
666 .all(|c| c.is_ascii_digit() || "kKmMgG".contains(c));
667 if !valid || value.is_empty() {
668 return Err(NucleusError::FilesystemError(format!(
669 "Invalid tmpfs size value '{}': only digits with optional K/M/G suffix allowed",
670 value
671 )));
672 }
673 }
674
675 let mount_data = size
678 .as_ref()
679 .map(|value| format!("size={},mode=0700", value))
680 .unwrap_or_else(|| "size=64M,mode=0700".to_string());
681
682 let mut flags = MsFlags::MS_NOSUID | MsFlags::MS_NODEV;
683 if volume.read_only {
684 flags |= MsFlags::MS_RDONLY;
685 }
686 mount(
687 Some("tmpfs"),
688 &dest,
689 Some("tmpfs"),
690 flags,
691 Some(mount_data.as_str()),
692 )
693 .map_err(|e| {
694 NucleusError::FilesystemError(format!(
695 "Failed to mount tmpfs volume at {:?}: {}",
696 dest, e
697 ))
698 })?;
699
700 info!(
701 "Mounted tmpfs volume at {:?}{}{}",
702 volume.dest,
703 size.as_ref()
704 .map(|value| format!(" (size={})", value))
705 .unwrap_or_default(),
706 if volume.read_only { " (ro)" } else { "" }
707 );
708 }
709 }
710 }
711
712 Ok(())
713}
714
715pub fn mount_procfs(
721 proc_path: &Path,
722 best_effort: bool,
723 read_only: bool,
724 hide_pids: bool,
725) -> Result<()> {
726 info!(
727 "Mounting procfs at {:?} (hidepid={})",
728 proc_path,
729 if hide_pids { "2" } else { "0" }
730 );
731
732 let mount_data: Option<&str> = if hide_pids { Some("hidepid=2") } else { None };
733
734 match mount(
735 Some("proc"),
736 proc_path,
737 Some("proc"),
738 MsFlags::MS_NOSUID | MsFlags::MS_NODEV | MsFlags::MS_NOEXEC,
739 mount_data,
740 ) {
741 Ok(_) => {
742 if read_only {
743 mount(
744 None::<&str>,
745 proc_path,
746 None::<&str>,
747 MsFlags::MS_REMOUNT
748 | MsFlags::MS_RDONLY
749 | MsFlags::MS_NOSUID
750 | MsFlags::MS_NODEV
751 | MsFlags::MS_NOEXEC,
752 None::<&str>,
753 )
754 .map_err(|e| {
755 NucleusError::FilesystemError(format!(
756 "Failed to remount procfs read-only: {}",
757 e
758 ))
759 })?;
760 info!("Successfully mounted procfs (read-only)");
761 } else {
762 info!("Successfully mounted procfs");
763 }
764 Ok(())
765 }
766 Err(e) => {
767 if best_effort {
768 warn!("Failed to mount procfs: {} (continuing anyway)", e);
769 Ok(())
770 } else {
771 Err(NucleusError::FilesystemError(format!(
772 "Failed to mount procfs: {}",
773 e
774 )))
775 }
776 }
777 }
778}
779
/// Entries directly under `/proc` that `mask_proc_paths` hides by bind mounting
/// `/dev/null` over them. Names are relative to the procfs mount point.
pub const PROC_NULL_MASKED: &[&str] = &[
    "kallsyms",
    "kcore",
    "sched_debug",
    "timer_list",
    "timer_stats",
    "keys",
    "latency_stats",
    "config.gz",
    "sysrq-trigger",
    "kpagecount",
    "kpageflags",
    "kpagecgroup",
];
797
/// Directories directly under `/proc` that `mask_proc_paths` hides by mounting a
/// read-only tmpfs over them. Names are relative to the procfs mount point.
pub const PROC_TMPFS_MASKED: &[&str] = &[
    "acpi",
    "bus",
    "irq",
    "scsi",
    "sys",
];
800
801pub fn mask_proc_paths(proc_path: &Path, production: bool) -> Result<()> {
809 info!("Masking sensitive /proc paths");
810
811 const CRITICAL_PROC_PATHS: &[&str] = &["kcore", "kallsyms", "sysrq-trigger"];
812
813 let dev_null = Path::new("/dev/null");
814
815 for name in PROC_NULL_MASKED {
816 let target = proc_path.join(name);
817 if !target.exists() {
818 continue;
819 }
820 match mount(
821 Some(dev_null),
822 &target,
823 None::<&str>,
824 MsFlags::MS_BIND,
825 None::<&str>,
826 ) {
827 Ok(_) => {
828 if let Err(e) = mount(
831 None::<&str>,
832 &target,
833 None::<&str>,
834 MsFlags::MS_REMOUNT | MsFlags::MS_BIND | MsFlags::MS_RDONLY,
835 None::<&str>,
836 ) {
837 if production && CRITICAL_PROC_PATHS.contains(name) {
838 return Err(NucleusError::FilesystemError(format!(
839 "Failed to remount /proc/{} read-only in production mode: {}",
840 name, e
841 )));
842 }
843 warn!(
844 "Failed to remount /proc/{} read-only: {} (continuing)",
845 name, e
846 );
847 }
848 debug!("Masked /proc/{} (read-only)", name);
849 }
850 Err(e) => {
851 if production && CRITICAL_PROC_PATHS.contains(name) {
852 return Err(NucleusError::FilesystemError(format!(
853 "Failed to mask critical /proc/{} in production mode: {}",
854 name, e
855 )));
856 }
857 warn!("Failed to mask /proc/{}: {} (continuing)", name, e);
858 }
859 }
860 }
861
862 for name in PROC_TMPFS_MASKED {
863 let target = proc_path.join(name);
864 if !target.exists() {
865 continue;
866 }
867 match mount(
868 Some("tmpfs"),
869 &target,
870 Some("tmpfs"),
871 MsFlags::MS_RDONLY | MsFlags::MS_NOSUID | MsFlags::MS_NODEV | MsFlags::MS_NOEXEC,
872 Some("size=0"),
873 ) {
874 Ok(_) => debug!("Masked /proc/{}", name),
875 Err(e) => {
876 if production {
877 return Err(NucleusError::FilesystemError(format!(
878 "Failed to mask /proc/{} in production mode: {}",
879 name, e
880 )));
881 }
882 warn!("Failed to mask /proc/{}: {} (continuing)", name, e);
883 }
884 }
885 }
886
887 info!("Finished masking sensitive /proc paths");
888 Ok(())
889}
890
891pub fn switch_root(new_root: &Path, allow_chroot_fallback: bool) -> Result<()> {
896 info!("Switching root to {:?}", new_root);
897
898 match pivot_root_impl(new_root) {
899 Ok(()) => {
900 info!("Successfully switched root using pivot_root");
901 Ok(())
902 }
903 Err(e) => {
904 if allow_chroot_fallback {
905 warn!(
906 "pivot_root failed ({}), falling back to chroot due to explicit \
907 configuration",
908 e
909 );
910 chroot_impl(new_root)
911 } else {
912 Err(NucleusError::PivotRootError(format!(
913 "pivot_root failed: {}. chroot fallback is disabled by default; use \
914 --allow-chroot-fallback to allow weaker isolation",
915 e
916 )))
917 }
918 }
919 }
920}
921
922fn pivot_root_impl(new_root: &Path) -> Result<()> {
928 use nix::unistd::pivot_root;
929
930 let old_root = new_root.join(".old_root");
934 std::fs::create_dir_all(&old_root).map_err(|e| {
935 NucleusError::PivotRootError(format!("Failed to create old_root directory: {}", e))
936 })?;
937
938 pivot_root(new_root, &old_root)
940 .map_err(|e| NucleusError::PivotRootError(format!("pivot_root syscall failed: {}", e)))?;
941
942 std::env::set_current_dir("/")
944 .map_err(|e| NucleusError::PivotRootError(format!("Failed to chdir to /: {}", e)))?;
945
946 nix::mount::umount2("/.old_root", nix::mount::MntFlags::MNT_DETACH)
948 .map_err(|e| NucleusError::PivotRootError(format!("Failed to unmount old root: {}", e)))?;
949
950 let _ = std::fs::remove_dir("/.old_root");
952
953 Ok(())
954}
955
956fn chroot_impl(new_root: &Path) -> Result<()> {
960 chroot(new_root)
961 .map_err(|e| NucleusError::PivotRootError(format!("chroot syscall failed: {}", e)))?;
962
963 std::env::set_current_dir("/")
965 .map_err(|e| NucleusError::PivotRootError(format!("Failed to chdir to /: {}", e)))?;
966
967 if let Err(e) = caps::drop(
970 None,
971 caps::CapSet::Bounding,
972 caps::Capability::CAP_SYS_CHROOT,
973 ) {
974 debug!(
975 "Could not drop CAP_SYS_CHROOT after chroot: {} (may not be present)",
976 e
977 );
978 }
979 if let Err(e) = caps::drop(
980 None,
981 caps::CapSet::Effective,
982 caps::Capability::CAP_SYS_CHROOT,
983 ) {
984 debug!(
985 "Could not drop effective CAP_SYS_CHROOT: {} (may not be present)",
986 e
987 );
988 }
989 if let Err(e) = caps::drop(
990 None,
991 caps::CapSet::Permitted,
992 caps::Capability::CAP_SYS_CHROOT,
993 ) {
994 debug!(
995 "Could not drop permitted CAP_SYS_CHROOT: {} (may not be present)",
996 e
997 );
998 }
999
1000 info!("Successfully switched root using chroot (CAP_SYS_CHROOT dropped)");
1001
1002 Ok(())
1003}
1004
1005pub fn mount_secrets(root: &Path, secrets: &[crate::container::SecretMount]) -> Result<()> {
1010 if secrets.is_empty() {
1011 return Ok(());
1012 }
1013
1014 info!("Mounting {} secret(s) into container", secrets.len());
1015
1016 for secret in secrets {
1017 let source_fd = open(
1018 &secret.source,
1019 OFlag::O_PATH | OFlag::O_NOFOLLOW | OFlag::O_CLOEXEC,
1020 Mode::empty(),
1021 )
1022 .map_err(|e| {
1023 NucleusError::FilesystemError(format!(
1024 "Failed to open secret source {:?} with O_NOFOLLOW: {}",
1025 secret.source, e
1026 ))
1027 })?;
1028 let source_stat = fstat(&source_fd).map_err(|e| {
1029 NucleusError::FilesystemError(format!(
1030 "Failed to stat secret source {:?}: {}",
1031 secret.source, e
1032 ))
1033 })?;
1034 let source_kind = SFlag::from_bits_truncate(source_stat.st_mode);
1035 let source_is_file = source_kind == SFlag::S_IFREG;
1036 let source_is_dir = source_kind == SFlag::S_IFDIR;
1037 if !source_is_file && !source_is_dir {
1038 return Err(NucleusError::FilesystemError(format!(
1039 "Secret source {:?} must be a regular file or directory",
1040 secret.source
1041 )));
1042 }
1043 let source_fd_path = PathBuf::from(format!("/proc/self/fd/{}", source_fd.as_raw_fd()));
1044
1045 let dest = resolve_container_destination(root, &secret.dest)?;
1047
1048 if let Some(parent) = dest.parent() {
1050 std::fs::create_dir_all(parent).map_err(|e| {
1051 NucleusError::FilesystemError(format!(
1052 "Failed to create secret mount parent {:?}: {}",
1053 parent, e
1054 ))
1055 })?;
1056 }
1057
1058 if source_is_file {
1060 std::fs::write(&dest, "").map_err(|e| {
1061 NucleusError::FilesystemError(format!(
1062 "Failed to create secret mount point {:?}: {}",
1063 dest, e
1064 ))
1065 })?;
1066 } else {
1067 std::fs::create_dir_all(&dest).map_err(|e| {
1068 NucleusError::FilesystemError(format!(
1069 "Failed to create secret mount dir {:?}: {}",
1070 dest, e
1071 ))
1072 })?;
1073 }
1074
1075 mount(
1077 Some(source_fd_path.as_path()),
1078 &dest,
1079 None::<&str>,
1080 MsFlags::MS_BIND,
1081 None::<&str>,
1082 )
1083 .map_err(|e| {
1084 NucleusError::FilesystemError(format!(
1085 "Failed to bind mount secret {:?}: {}",
1086 secret.source, e
1087 ))
1088 })?;
1089
1090 mount(
1091 None::<&str>,
1092 &dest,
1093 None::<&str>,
1094 MsFlags::MS_REMOUNT
1095 | MsFlags::MS_BIND
1096 | MsFlags::MS_RDONLY
1097 | MsFlags::MS_NOSUID
1098 | MsFlags::MS_NODEV
1099 | MsFlags::MS_NOEXEC,
1100 None::<&str>,
1101 )
1102 .map_err(|e| {
1103 NucleusError::FilesystemError(format!(
1104 "Failed to remount secret {:?} read-only: {}",
1105 dest, e
1106 ))
1107 })?;
1108
1109 if source_is_file {
1111 use std::os::unix::fs::PermissionsExt;
1112 let perms = std::fs::Permissions::from_mode(secret.mode);
1113 if let Err(e) = std::fs::set_permissions(&dest, perms) {
1114 warn!(
1115 "Failed to set mode {:04o} on secret {:?}: {} (bind mount may override)",
1116 secret.mode, dest, e
1117 );
1118 }
1119 }
1120
1121 debug!(
1122 "Mounted secret {:?} -> {:?} (mode {:04o})",
1123 secret.source, secret.dest, secret.mode
1124 );
1125 }
1126
1127 Ok(())
1128}
1129
1130pub fn mount_secrets_inmemory(
1136 root: &Path,
1137 secrets: &[crate::container::SecretMount],
1138 identity: &crate::container::ProcessIdentity,
1139) -> Result<()> {
1140 if secrets.is_empty() {
1141 return Ok(());
1142 }
1143
1144 info!("Mounting {} secret(s) on in-memory tmpfs", secrets.len());
1145
1146 let secrets_dir = root.join("run/secrets");
1147 std::fs::create_dir_all(&secrets_dir).map_err(|e| {
1148 NucleusError::FilesystemError(format!(
1149 "Failed to create secrets dir {:?}: {}",
1150 secrets_dir, e
1151 ))
1152 })?;
1153
1154 if let Err(e) = mount(
1156 Some("tmpfs"),
1157 &secrets_dir,
1158 Some("tmpfs"),
1159 MsFlags::MS_NOSUID | MsFlags::MS_NODEV | MsFlags::MS_NOEXEC,
1160 Some("size=16m,mode=0700"),
1161 ) {
1162 let _ = std::fs::remove_dir_all(&secrets_dir);
1163 return Err(NucleusError::FilesystemError(format!(
1164 "Failed to mount secrets tmpfs at {:?}: {}",
1165 secrets_dir, e
1166 )));
1167 }
1168
1169 if !identity.is_root() {
1170 nix::unistd::chown(
1171 &secrets_dir,
1172 Some(nix::unistd::Uid::from_raw(identity.uid)),
1173 Some(nix::unistd::Gid::from_raw(identity.gid)),
1174 )
1175 .map_err(|e| {
1176 let _ = nix::mount::umount2(&secrets_dir, nix::mount::MntFlags::MNT_DETACH);
1177 let _ = std::fs::remove_dir_all(&secrets_dir);
1178 NucleusError::FilesystemError(format!(
1179 "Failed to set /run/secrets owner to {}:{}: {}",
1180 identity.uid, identity.gid, e
1181 ))
1182 })?;
1183 }
1184
1185 let result = mount_secrets_inmemory_inner(&secrets_dir, root, secrets, identity);
1187 if let Err(ref e) = result {
1188 let _ = nix::mount::umount2(&secrets_dir, nix::mount::MntFlags::MNT_DETACH);
1189 let _ = std::fs::remove_dir_all(&secrets_dir);
1190 return Err(NucleusError::FilesystemError(format!(
1191 "Secret mount failed (rolled back): {}",
1192 e
1193 )));
1194 }
1195
1196 info!("All secrets mounted on in-memory tmpfs");
1197 Ok(())
1198}
1199
1200fn mount_secrets_inmemory_inner(
1201 secrets_dir: &Path,
1202 root: &Path,
1203 secrets: &[crate::container::SecretMount],
1204 identity: &crate::container::ProcessIdentity,
1205) -> Result<()> {
1206 for secret in secrets {
1207 let mut content = read_regular_file_nofollow(&secret.source)?;
1208
1209 let dest = resolve_container_destination(secrets_dir, &secret.dest)?;
1211
1212 if let Some(parent) = dest.parent() {
1214 std::fs::create_dir_all(parent).map_err(|e| {
1215 NucleusError::FilesystemError(format!(
1216 "Failed to create secret parent dir {:?}: {}",
1217 parent, e
1218 ))
1219 })?;
1220 }
1221
1222 std::fs::write(&dest, &content).map_err(|e| {
1224 NucleusError::FilesystemError(format!("Failed to write secret to {:?}: {}", dest, e))
1225 })?;
1226
1227 {
1229 use std::os::unix::fs::PermissionsExt;
1230 let perms = std::fs::Permissions::from_mode(secret.mode);
1231 std::fs::set_permissions(&dest, perms).map_err(|e| {
1232 NucleusError::FilesystemError(format!(
1233 "Failed to set permissions on secret {:?}: {}",
1234 dest, e
1235 ))
1236 })?;
1237 }
1238
1239 if !identity.is_root() {
1240 nix::unistd::chown(
1241 &dest,
1242 Some(nix::unistd::Uid::from_raw(identity.uid)),
1243 Some(nix::unistd::Gid::from_raw(identity.gid)),
1244 )
1245 .map_err(|e| {
1246 NucleusError::FilesystemError(format!(
1247 "Failed to set permissions owner on secret {:?} to {}:{}: {}",
1248 dest, identity.uid, identity.gid, e
1249 ))
1250 })?;
1251 }
1252
1253 zeroize::Zeroize::zeroize(&mut content);
1255 drop(content);
1256
1257 let container_dest = resolve_container_destination(root, &secret.dest)?;
1259 if container_dest != dest {
1260 if let Some(parent) = container_dest.parent() {
1261 std::fs::create_dir_all(parent).map_err(|e| {
1262 NucleusError::FilesystemError(format!(
1263 "Failed to create secret mount parent {:?}: {}",
1264 parent, e
1265 ))
1266 })?;
1267 }
1268
1269 std::fs::write(&container_dest, "").map_err(|e| {
1270 NucleusError::FilesystemError(format!(
1271 "Failed to create secret mount point {:?}: {}",
1272 container_dest, e
1273 ))
1274 })?;
1275
1276 mount(
1277 Some(dest.as_path()),
1278 &container_dest,
1279 None::<&str>,
1280 MsFlags::MS_BIND,
1281 None::<&str>,
1282 )
1283 .map_err(|e| {
1284 NucleusError::FilesystemError(format!(
1285 "Failed to bind mount secret {:?} -> {:?}: {}",
1286 dest, container_dest, e
1287 ))
1288 })?;
1289
1290 mount(
1291 None::<&str>,
1292 &container_dest,
1293 None::<&str>,
1294 MsFlags::MS_REMOUNT
1295 | MsFlags::MS_BIND
1296 | MsFlags::MS_RDONLY
1297 | MsFlags::MS_NOSUID
1298 | MsFlags::MS_NODEV
1299 | MsFlags::MS_NOEXEC,
1300 None::<&str>,
1301 )
1302 .map_err(|e| {
1303 NucleusError::FilesystemError(format!(
1304 "Failed to remount secret {:?} read-only: {}",
1305 container_dest, e
1306 ))
1307 })?;
1308 }
1309
1310 debug!(
1311 "Secret {:?} -> {:?} (in-memory tmpfs, mode {:04o})",
1312 secret.source, secret.dest, secret.mode
1313 );
1314 }
1315
1316 Ok(())
1317}
1318
#[cfg(test)]
mod tests {
    use super::*;
    use std::os::unix::fs::symlink;

    /// Paths at or below a denied prefix must be refused with the sensitive-path error.
    #[test]
    fn test_validate_bind_mount_source_rejects_sensitive_subtrees() {
        let denied_sources = ["/proc/sys", "/sys/fs/cgroup", "/dev/kmsg", "/boot"];
        for path in denied_sources {
            let err = validate_bind_mount_source(Path::new(path)).unwrap_err();
            let rendered = err.to_string();
            assert!(
                rendered.contains("sensitive host path"),
                "expected sensitive-path rejection for {path}, got: {err}"
            );
        }
    }

    /// An ordinary directory in a temp location passes validation.
    #[test]
    fn test_validate_bind_mount_source_allows_regular_host_paths() {
        let sandbox = tempfile::TempDir::new().unwrap();
        let data_dir = sandbox.path().join("data");
        std::fs::create_dir(&data_dir).unwrap();

        validate_bind_mount_source(&data_dir).unwrap();
    }

    /// `..` components are collapsed before the deny lists are consulted.
    #[test]
    fn test_validate_bind_mount_source_normalizes_parent_components_before_filtering() {
        let sandbox = tempfile::TempDir::new().unwrap();
        let data_dir = sandbox.path().join("data");
        std::fs::create_dir(&data_dir).unwrap();

        let roundabout = data_dir.join("../data");
        validate_bind_mount_source(&roundabout).unwrap();
    }

    #[test]
    fn test_proc_mask_includes_sysrq_trigger() {
        let masked = PROC_NULL_MASKED.contains(&"sysrq-trigger");
        assert!(
            masked,
            "/proc/sysrq-trigger must be masked to prevent host DoS"
        );
    }

    #[test]
    fn test_proc_mask_includes_timer_stats() {
        let masked = PROC_NULL_MASKED.contains(&"timer_stats");
        assert!(
            masked,
            "/proc/timer_stats must be masked to prevent kernel info leakage"
        );
    }

    #[test]
    fn test_proc_mask_includes_kpage_files() {
        let kpage_entries = ["kpagecount", "kpageflags", "kpagecgroup"];
        for path in &kpage_entries {
            assert!(
                PROC_NULL_MASKED.contains(path),
                "/proc/{} must be masked to prevent host memory layout leakage",
                path
            );
        }
    }

    #[test]
    fn test_proc_mask_includes_oci_standard_paths() {
        let null_masked = ["kallsyms", "kcore", "sched_debug", "keys", "config.gz"];
        for path in &null_masked {
            assert!(
                PROC_NULL_MASKED.contains(path),
                "/proc/{} must be in null-masked list (OCI spec)",
                path
            );
        }

        let tmpfs_masked = ["acpi", "bus", "scsi", "sys"];
        for path in &tmpfs_masked {
            assert!(
                PROC_TMPFS_MASKED.contains(path),
                "/proc/{} must be in tmpfs-masked list (OCI spec)",
                path
            );
        }
    }

    /// A plain file is returned byte-for-byte.
    #[test]
    fn test_read_regular_file_nofollow_reads_regular_file() {
        let sandbox = tempfile::tempdir().unwrap();
        let secret_path = sandbox.path().join("secret.txt");
        std::fs::write(&secret_path, "supersecret").unwrap();

        let bytes = read_regular_file_nofollow(&secret_path).unwrap();
        assert_eq!(bytes, b"supersecret");
    }

    /// A symlink to a valid file must be refused at open time.
    #[test]
    fn test_read_regular_file_nofollow_rejects_symlink() {
        let sandbox = tempfile::tempdir().unwrap();
        let target = sandbox.path().join("target.txt");
        let link = sandbox.path().join("secret-link");
        std::fs::write(&target, "supersecret").unwrap();
        symlink(&target, &link).unwrap();

        let err = read_regular_file_nofollow(&link).unwrap_err();
        let rendered = err.to_string();
        assert!(
            rendered.contains("O_NOFOLLOW"),
            "symlink reads must fail via O_NOFOLLOW"
        );
    }
}