1use crate::error::{NucleusError, Result};
2use nix::fcntl::{open, OFlag};
3use nix::mount::{mount, MsFlags};
4use nix::sys::stat::{fstat, makedev, mknod, Mode, SFlag};
5use nix::unistd::chroot;
6use std::fs::OpenOptions;
7use std::io::Read;
8use std::os::fd::AsRawFd;
9use std::os::unix::fs::OpenOptionsExt;
10use std::path::{Component, Path, PathBuf};
11use tracing::{debug, info, warn};
12
/// One row of the mount audit table: a mount point plus the mount options it
/// is required to carry.
struct ExpectedMount {
    /// Mount point path, matched against the mount-point field of each
    /// `/proc/self/mounts` entry by `audit_mounts`.
    path: &'static str,
    /// Option names that must each appear in the entry's comma-separated
    /// option list.
    required_flags: &'static [&'static str],
    /// When `true`, the complete absence of this mount is reported as a
    /// violation in production mode.
    critical: bool,
}
21
22const PRODUCTION_MOUNT_EXPECTATIONS: &[ExpectedMount] = &[
24 ExpectedMount {
25 path: "/bin",
26 required_flags: &["ro", "nosuid", "nodev"],
27 critical: true,
28 },
29 ExpectedMount {
30 path: "/usr",
31 required_flags: &["ro", "nosuid", "nodev"],
32 critical: true,
33 },
34 ExpectedMount {
35 path: "/lib",
36 required_flags: &["ro", "nosuid", "nodev"],
37 critical: false, },
39 ExpectedMount {
40 path: "/lib64",
41 required_flags: &["ro", "nosuid", "nodev"],
42 critical: false, },
44 ExpectedMount {
45 path: "/etc",
46 required_flags: &["ro", "nosuid", "nodev"],
47 critical: true,
48 },
49 ExpectedMount {
50 path: "/nix",
51 required_flags: &["ro", "nosuid", "nodev"],
52 critical: false, },
54 ExpectedMount {
55 path: "/sbin",
56 required_flags: &["ro", "nosuid", "nodev"],
57 critical: false, },
59 ExpectedMount {
60 path: "/proc",
61 required_flags: &["nosuid", "nodev", "noexec"],
62 critical: true,
63 },
64 ExpectedMount {
65 path: "/run/secrets",
66 required_flags: &["nosuid", "nodev", "noexec"],
67 critical: false, },
69];
70
71pub fn normalize_container_destination(dest: &Path) -> Result<PathBuf> {
76 if !dest.is_absolute() {
77 return Err(NucleusError::ConfigError(format!(
78 "Container destination must be absolute: {:?}",
79 dest
80 )));
81 }
82
83 let mut normalized = PathBuf::from("/");
84 let mut saw_component = false;
85
86 for component in dest.components() {
87 match component {
88 Component::RootDir => {}
89 Component::CurDir => {}
90 Component::Normal(part) => {
91 normalized.push(part);
92 saw_component = true;
93 }
94 Component::ParentDir => {
95 return Err(NucleusError::ConfigError(format!(
96 "Container destination must not contain parent traversal: {:?}",
97 dest
98 )));
99 }
100 Component::Prefix(_) => {
101 return Err(NucleusError::ConfigError(format!(
102 "Unsupported container destination prefix: {:?}",
103 dest
104 )));
105 }
106 }
107 }
108
109 if !saw_component {
110 return Err(NucleusError::ConfigError(format!(
111 "Container destination must not be the root directory: {:?}",
112 dest
113 )));
114 }
115
116 Ok(normalized)
117}
118
119pub fn resolve_container_destination(root: &Path, dest: &Path) -> Result<PathBuf> {
121 let normalized = normalize_container_destination(dest)?;
122 let relative = normalized.strip_prefix("/").map_err(|_| {
123 NucleusError::ConfigError(format!(
124 "Container destination is not absolute after normalization: {:?}",
125 normalized
126 ))
127 })?;
128 Ok(root.join(relative))
129}
130
131pub(crate) fn read_regular_file_nofollow(path: &Path) -> Result<Vec<u8>> {
132 let mut file = OpenOptions::new()
133 .read(true)
134 .custom_flags(libc::O_NOFOLLOW | libc::O_CLOEXEC)
135 .open(path)
136 .map_err(|e| {
137 NucleusError::FilesystemError(format!(
138 "Failed to open file {:?} with O_NOFOLLOW: {}",
139 path, e
140 ))
141 })?;
142
143 let metadata = file.metadata().map_err(|e| {
144 NucleusError::FilesystemError(format!("Failed to stat file {:?}: {}", path, e))
145 })?;
146 if !metadata.is_file() {
147 return Err(NucleusError::FilesystemError(format!(
148 "Expected regular file for {:?}, found non-file source",
149 path
150 )));
151 }
152
153 let mut content = Vec::new();
154 file.read_to_end(&mut content).map_err(|e| {
155 NucleusError::FilesystemError(format!("Failed to read file {:?}: {}", path, e))
156 })?;
157 Ok(content)
158}
159
160pub fn audit_mounts(production_mode: bool) -> Result<()> {
166 let mounts_content = std::fs::read_to_string("/proc/self/mounts").map_err(|e| {
167 NucleusError::FilesystemError(format!("Failed to read /proc/self/mounts: {}", e))
168 })?;
169
170 let mut violations = Vec::new();
171
172 for expectation in PRODUCTION_MOUNT_EXPECTATIONS {
173 let mount_entry = mounts_content.lines().find(|line| {
175 let parts: Vec<&str> = line.split_whitespace().collect();
176 parts.len() >= 4 && parts[1] == expectation.path
177 });
178
179 if let Some(entry) = mount_entry {
180 let parts: Vec<&str> = entry.split_whitespace().collect();
181 if parts.len() >= 4 {
182 let options = parts[3];
183 for &flag in expectation.required_flags {
184 if !options.split(',').any(|opt| opt == flag) {
185 violations.push(format!(
186 "Mount {} missing required flag '{}' (has: {})",
187 expectation.path, flag, options
188 ));
189 }
190 }
191 }
192 } else if expectation.critical && production_mode {
193 violations.push(format!(
194 "Critical mount {} is missing from the mount namespace",
195 expectation.path
196 ));
197 }
198 }
199
200 if violations.is_empty() {
201 info!("Mount audit passed: all expected flags verified");
202 Ok(())
203 } else if production_mode {
204 Err(NucleusError::FilesystemError(format!(
205 "Mount audit failed in production mode:\n {}",
206 violations.join("\n ")
207 )))
208 } else {
209 for v in &violations {
210 warn!("Mount audit: {}", v);
211 }
212 Ok(())
213 }
214}
215
216pub fn create_minimal_fs(root: &Path) -> Result<()> {
218 info!("Creating minimal filesystem structure at {:?}", root);
219
220 let dirs = vec![
222 "dev",
223 "proc",
224 "sys",
225 "tmp",
226 "bin",
227 "sbin",
228 "usr",
229 "lib",
230 "lib64",
231 "etc",
232 "nix",
233 "nix/store",
234 "run",
235 "context",
236 ];
237
238 for dir in dirs {
239 let path = root.join(dir);
240 std::fs::create_dir_all(&path).map_err(|e| {
241 NucleusError::FilesystemError(format!("Failed to create directory {:?}: {}", path, e))
242 })?;
243 }
244
245 info!("Created minimal filesystem structure");
246
247 Ok(())
248}
249
250pub fn create_dev_nodes(dev_path: &Path, include_tty: bool) -> Result<()> {
254 info!("Creating device nodes at {:?}", dev_path);
255
256 let mut devices = vec![
258 ("null", SFlag::S_IFCHR, 1, 3),
259 ("zero", SFlag::S_IFCHR, 1, 5),
260 ("full", SFlag::S_IFCHR, 1, 7),
261 ("random", SFlag::S_IFCHR, 1, 8),
262 ("urandom", SFlag::S_IFCHR, 1, 9),
263 ];
264 if include_tty {
265 devices.push(("tty", SFlag::S_IFCHR, 5, 0));
266 }
267
268 let mut created_count = 0;
269 let mut failed_count = 0;
270
271 for (name, dev_type, major, minor) in devices {
272 let path = dev_path.join(name);
273 let mode = Mode::from_bits_truncate(0o660);
274 let dev = makedev(major, minor);
275
276 match mknod(&path, dev_type, mode, dev) {
277 Ok(_) => {
278 info!("Created device node: {:?}", path);
279 created_count += 1;
280 }
281 Err(e) => {
282 warn!(
284 "Failed to create device node {:?}: {} (this is normal in rootless mode)",
285 path, e
286 );
287 failed_count += 1;
288 }
289 }
290 }
291
292 if created_count > 0 {
293 info!("Successfully created {} device nodes", created_count);
294 }
295 if failed_count > 0 {
296 info!("Skipped {} device nodes (rootless mode)", failed_count);
297 }
298
299 Ok(())
300}
301
302pub fn bind_mount_rootfs(root: &Path, rootfs_path: &Path) -> Result<()> {
307 info!(
308 "Bind mounting production rootfs {:?} into container {:?}",
309 rootfs_path, root
310 );
311
312 if std::fs::symlink_metadata(rootfs_path).is_err() {
313 return Err(NucleusError::FilesystemError(format!(
314 "Rootfs path does not exist: {:?}",
315 rootfs_path
316 )));
317 }
318
319 let subdirs = ["bin", "sbin", "lib", "lib64", "usr", "etc", "nix"];
323
324 for subdir in &subdirs {
325 let source = rootfs_path.join(subdir);
326 if !source.exists() {
327 debug!("Rootfs subdir {} not present, skipping", subdir);
328 continue;
329 }
330
331 let target = root.join(subdir);
332 std::fs::create_dir_all(&target).map_err(|e| {
333 NucleusError::FilesystemError(format!(
334 "Failed to create mount point {:?}: {}",
335 target, e
336 ))
337 })?;
338
339 mount(
340 Some(&source),
341 &target,
342 None::<&str>,
343 MsFlags::MS_BIND | MsFlags::MS_REC,
344 None::<&str>,
345 )
346 .map_err(|e| {
347 NucleusError::FilesystemError(format!(
348 "Failed to bind mount rootfs {:?} -> {:?}: {}",
349 source, target, e
350 ))
351 })?;
352
353 mount(
355 None::<&str>,
356 &target,
357 None::<&str>,
358 MsFlags::MS_REMOUNT
359 | MsFlags::MS_BIND
360 | MsFlags::MS_RDONLY
361 | MsFlags::MS_REC
362 | MsFlags::MS_NOSUID
363 | MsFlags::MS_NODEV,
364 None::<&str>,
365 )
366 .map_err(|e| {
367 NucleusError::FilesystemError(format!(
368 "Failed to remount rootfs {:?} read-only: {}",
369 target, e
370 ))
371 })?;
372
373 info!("Mounted rootfs/{} read-only", subdir);
374 }
375
376 Ok(())
377}
378
379pub fn bind_mount_host_paths(root: &Path, best_effort: bool) -> Result<()> {
384 info!("Bind mounting host paths into container");
385
386 let host_paths = vec![
388 "/bin", "/usr", "/lib", "/lib64", "/nix", ];
390
391 for host_path in host_paths {
392 let host = Path::new(host_path);
393
394 if !host.exists() {
396 debug!("Skipping {} (not present on host)", host_path);
397 continue;
398 }
399
400 let container_path = root.join(host_path.trim_start_matches('/'));
401
402 if let Err(e) = std::fs::create_dir_all(&container_path) {
404 if best_effort {
405 warn!("Failed to create mount point {:?}: {}", container_path, e);
406 continue;
407 }
408 return Err(NucleusError::FilesystemError(format!(
409 "Failed to create mount point {:?}: {}",
410 container_path, e
411 )));
412 }
413
414 match mount(
418 Some(host),
419 &container_path,
420 None::<&str>,
421 MsFlags::MS_BIND | MsFlags::MS_REC,
422 None::<&str>,
423 ) {
424 Ok(_) => {
425 mount(
427 None::<&str>,
428 &container_path,
429 None::<&str>,
430 MsFlags::MS_REMOUNT
431 | MsFlags::MS_BIND
432 | MsFlags::MS_RDONLY
433 | MsFlags::MS_REC
434 | MsFlags::MS_NOSUID
435 | MsFlags::MS_NODEV,
436 None::<&str>,
437 )
438 .map_err(|e| {
439 NucleusError::FilesystemError(format!(
440 "Failed to remount {} as read-only: {}",
441 host_path, e
442 ))
443 })?;
444 info!(
445 "Bind mounted {} to {:?} (read-only)",
446 host_path, container_path
447 );
448 }
449 Err(e) => {
450 if best_effort {
451 warn!(
452 "Failed to bind mount {}: {} (continuing anyway)",
453 host_path, e
454 );
455 } else {
456 return Err(NucleusError::FilesystemError(format!(
457 "Failed to bind mount {}: {}",
458 host_path, e
459 )));
460 }
461 }
462 }
463 }
464
465 Ok(())
466}
467
/// Host paths that `validate_bind_mount_source` refuses as bind-mount
/// sources.
const DENIED_BIND_MOUNT_SOURCES: &[&str] = &[
    // Whole-tree and kernel/boot interfaces.
    "/",
    "/proc",
    "/sys",
    "/dev",
    "/boot",
    // Account and privilege databases.
    "/etc/shadow",
    "/etc/sudoers",
    "/etc/passwd",
    "/etc/gshadow",
];
480
481fn validate_bind_mount_source(source: &Path) -> Result<()> {
483 let source_str = source.to_string_lossy();
484 for denied in DENIED_BIND_MOUNT_SOURCES {
485 if source_str == *denied {
486 return Err(NucleusError::FilesystemError(format!(
487 "Bind mount source '{}' is a sensitive host path and cannot be mounted into containers",
488 source.display()
489 )));
490 }
491 }
492 Ok(())
493}
494
495pub fn mount_volumes(root: &Path, volumes: &[crate::container::VolumeMount]) -> Result<()> {
497 use crate::container::VolumeSource;
498
499 if volumes.is_empty() {
500 return Ok(());
501 }
502
503 info!("Mounting {} volume(s) into container", volumes.len());
504
505 for volume in volumes {
506 let dest = resolve_container_destination(root, &volume.dest)?;
507
508 match &volume.source {
509 VolumeSource::Bind { source } => {
510 validate_bind_mount_source(source)?;
512
513 if std::fs::symlink_metadata(source).is_err() {
516 return Err(NucleusError::FilesystemError(format!(
517 "Volume source does not exist: {:?}",
518 source
519 )));
520 }
521
522 if let Some(parent) = dest.parent() {
523 std::fs::create_dir_all(parent).map_err(|e| {
524 NucleusError::FilesystemError(format!(
525 "Failed to create volume mount parent {:?}: {}",
526 parent, e
527 ))
528 })?;
529 }
530
531 let recursive = source.is_dir();
532 if source.is_file() {
533 std::fs::write(&dest, "").map_err(|e| {
534 NucleusError::FilesystemError(format!(
535 "Failed to create volume mount point {:?}: {}",
536 dest, e
537 ))
538 })?;
539 } else {
540 std::fs::create_dir_all(&dest).map_err(|e| {
541 NucleusError::FilesystemError(format!(
542 "Failed to create volume mount dir {:?}: {}",
543 dest, e
544 ))
545 })?;
546 }
547
548 let initial_flags = if recursive {
549 MsFlags::MS_BIND | MsFlags::MS_REC
550 } else {
551 MsFlags::MS_BIND
552 };
553 mount(
554 Some(source.as_path()),
555 &dest,
556 None::<&str>,
557 initial_flags,
558 None::<&str>,
559 )
560 .map_err(|e| {
561 NucleusError::FilesystemError(format!(
562 "Failed to bind mount volume {:?} -> {:?}: {}",
563 source, dest, e
564 ))
565 })?;
566
567 let mut remount_flags =
568 MsFlags::MS_REMOUNT | MsFlags::MS_BIND | MsFlags::MS_NOSUID | MsFlags::MS_NODEV;
569 if recursive {
570 remount_flags |= MsFlags::MS_REC;
571 }
572 if volume.read_only {
573 remount_flags |= MsFlags::MS_RDONLY;
574 }
575
576 mount(
577 None::<&str>,
578 &dest,
579 None::<&str>,
580 remount_flags,
581 None::<&str>,
582 )
583 .map_err(|e| {
584 NucleusError::FilesystemError(format!(
585 "Failed to remount volume {:?} with final flags: {}",
586 dest, e
587 ))
588 })?;
589
590 info!(
591 "Mounted bind volume {:?} -> {:?} ({})",
592 source,
593 volume.dest,
594 if volume.read_only { "ro" } else { "rw" }
595 );
596 }
597 VolumeSource::Tmpfs { size } => {
598 std::fs::create_dir_all(&dest).map_err(|e| {
599 NucleusError::FilesystemError(format!(
600 "Failed to create tmpfs mount dir {:?}: {}",
601 dest, e
602 ))
603 })?;
604
605 if let Some(value) = size.as_ref() {
608 let valid = value
609 .chars()
610 .all(|c| c.is_ascii_digit() || "kKmMgG".contains(c));
611 if !valid || value.is_empty() {
612 return Err(NucleusError::FilesystemError(format!(
613 "Invalid tmpfs size value '{}': only digits with optional K/M/G suffix allowed",
614 value
615 )));
616 }
617 }
618
619 let mount_data = size
622 .as_ref()
623 .map(|value| format!("size={},mode=0700", value))
624 .unwrap_or_else(|| "size=64M,mode=0700".to_string());
625
626 let mut flags = MsFlags::MS_NOSUID | MsFlags::MS_NODEV;
627 if volume.read_only {
628 flags |= MsFlags::MS_RDONLY;
629 }
630 mount(
631 Some("tmpfs"),
632 &dest,
633 Some("tmpfs"),
634 flags,
635 Some(mount_data.as_str()),
636 )
637 .map_err(|e| {
638 NucleusError::FilesystemError(format!(
639 "Failed to mount tmpfs volume at {:?}: {}",
640 dest, e
641 ))
642 })?;
643
644 info!(
645 "Mounted tmpfs volume at {:?}{}{}",
646 volume.dest,
647 size.as_ref()
648 .map(|value| format!(" (size={})", value))
649 .unwrap_or_default(),
650 if volume.read_only { " (ro)" } else { "" }
651 );
652 }
653 }
654 }
655
656 Ok(())
657}
658
659pub fn mount_procfs(
665 proc_path: &Path,
666 best_effort: bool,
667 read_only: bool,
668 hide_pids: bool,
669) -> Result<()> {
670 info!(
671 "Mounting procfs at {:?} (hidepid={})",
672 proc_path,
673 if hide_pids { "2" } else { "0" }
674 );
675
676 let mount_data: Option<&str> = if hide_pids { Some("hidepid=2") } else { None };
677
678 match mount(
679 Some("proc"),
680 proc_path,
681 Some("proc"),
682 MsFlags::MS_NOSUID | MsFlags::MS_NODEV | MsFlags::MS_NOEXEC,
683 mount_data,
684 ) {
685 Ok(_) => {
686 if read_only {
687 mount(
688 None::<&str>,
689 proc_path,
690 None::<&str>,
691 MsFlags::MS_REMOUNT
692 | MsFlags::MS_RDONLY
693 | MsFlags::MS_NOSUID
694 | MsFlags::MS_NODEV
695 | MsFlags::MS_NOEXEC,
696 None::<&str>,
697 )
698 .map_err(|e| {
699 NucleusError::FilesystemError(format!(
700 "Failed to remount procfs read-only: {}",
701 e
702 ))
703 })?;
704 info!("Successfully mounted procfs (read-only)");
705 } else {
706 info!("Successfully mounted procfs");
707 }
708 Ok(())
709 }
710 Err(e) => {
711 if best_effort {
712 warn!("Failed to mount procfs: {} (continuing anyway)", e);
713 Ok(())
714 } else {
715 Err(NucleusError::FilesystemError(format!(
716 "Failed to mount procfs: {}",
717 e
718 )))
719 }
720 }
721 }
722}
723
/// Entries directly under the procfs mount that `mask_proc_paths` hides by
/// bind-mounting `/dev/null` over them.
pub const PROC_NULL_MASKED: &[&str] = &[
    // Kernel symbols and memory image.
    "kallsyms",
    "kcore",
    // Scheduler and timer internals.
    "sched_debug",
    "timer_list",
    "timer_stats",
    // Keyring, latency statistics and kernel configuration.
    "keys",
    "latency_stats",
    "config.gz",
    // SysRq trigger.
    "sysrq-trigger",
    // Physical page accounting.
    "kpagecount",
    "kpageflags",
    "kpagecgroup",
];
741
/// Entries directly under the procfs mount that `mask_proc_paths` hides by
/// mounting a read-only tmpfs over them.
pub const PROC_TMPFS_MASKED: &[&str] = &[
    "acpi",
    "bus",
    "irq",
    "scsi",
    "sys",
];
744
745pub fn mask_proc_paths(proc_path: &Path, production: bool) -> Result<()> {
753 info!("Masking sensitive /proc paths");
754
755 const CRITICAL_PROC_PATHS: &[&str] = &["kcore", "kallsyms", "sysrq-trigger"];
756
757 let dev_null = Path::new("/dev/null");
758
759 for name in PROC_NULL_MASKED {
760 let target = proc_path.join(name);
761 if !target.exists() {
762 continue;
763 }
764 match mount(
765 Some(dev_null),
766 &target,
767 None::<&str>,
768 MsFlags::MS_BIND,
769 None::<&str>,
770 ) {
771 Ok(_) => {
772 if let Err(e) = mount(
775 None::<&str>,
776 &target,
777 None::<&str>,
778 MsFlags::MS_REMOUNT | MsFlags::MS_BIND | MsFlags::MS_RDONLY,
779 None::<&str>,
780 ) {
781 if production && CRITICAL_PROC_PATHS.contains(name) {
782 return Err(NucleusError::FilesystemError(format!(
783 "Failed to remount /proc/{} read-only in production mode: {}",
784 name, e
785 )));
786 }
787 warn!(
788 "Failed to remount /proc/{} read-only: {} (continuing)",
789 name, e
790 );
791 }
792 debug!("Masked /proc/{} (read-only)", name);
793 }
794 Err(e) => {
795 if production && CRITICAL_PROC_PATHS.contains(name) {
796 return Err(NucleusError::FilesystemError(format!(
797 "Failed to mask critical /proc/{} in production mode: {}",
798 name, e
799 )));
800 }
801 warn!("Failed to mask /proc/{}: {} (continuing)", name, e);
802 }
803 }
804 }
805
806 for name in PROC_TMPFS_MASKED {
807 let target = proc_path.join(name);
808 if !target.exists() {
809 continue;
810 }
811 match mount(
812 Some("tmpfs"),
813 &target,
814 Some("tmpfs"),
815 MsFlags::MS_RDONLY | MsFlags::MS_NOSUID | MsFlags::MS_NODEV | MsFlags::MS_NOEXEC,
816 Some("size=0"),
817 ) {
818 Ok(_) => debug!("Masked /proc/{}", name),
819 Err(e) => {
820 if production {
821 return Err(NucleusError::FilesystemError(format!(
822 "Failed to mask /proc/{} in production mode: {}",
823 name, e
824 )));
825 }
826 warn!("Failed to mask /proc/{}: {} (continuing)", name, e);
827 }
828 }
829 }
830
831 info!("Finished masking sensitive /proc paths");
832 Ok(())
833}
834
835pub fn switch_root(new_root: &Path, allow_chroot_fallback: bool) -> Result<()> {
840 info!("Switching root to {:?}", new_root);
841
842 match pivot_root_impl(new_root) {
843 Ok(()) => {
844 info!("Successfully switched root using pivot_root");
845 Ok(())
846 }
847 Err(e) => {
848 if allow_chroot_fallback {
849 warn!(
850 "pivot_root failed ({}), falling back to chroot due to explicit \
851 configuration",
852 e
853 );
854 chroot_impl(new_root)
855 } else {
856 Err(NucleusError::PivotRootError(format!(
857 "pivot_root failed: {}. chroot fallback is disabled by default; use \
858 --allow-chroot-fallback to allow weaker isolation",
859 e
860 )))
861 }
862 }
863 }
864}
865
866fn pivot_root_impl(new_root: &Path) -> Result<()> {
872 use nix::unistd::pivot_root;
873
874 let old_root = new_root.join(".old_root");
878 std::fs::create_dir_all(&old_root).map_err(|e| {
879 NucleusError::PivotRootError(format!("Failed to create old_root directory: {}", e))
880 })?;
881
882 pivot_root(new_root, &old_root)
884 .map_err(|e| NucleusError::PivotRootError(format!("pivot_root syscall failed: {}", e)))?;
885
886 std::env::set_current_dir("/")
888 .map_err(|e| NucleusError::PivotRootError(format!("Failed to chdir to /: {}", e)))?;
889
890 nix::mount::umount2("/.old_root", nix::mount::MntFlags::MNT_DETACH)
892 .map_err(|e| NucleusError::PivotRootError(format!("Failed to unmount old root: {}", e)))?;
893
894 let _ = std::fs::remove_dir("/.old_root");
896
897 Ok(())
898}
899
900fn chroot_impl(new_root: &Path) -> Result<()> {
904 chroot(new_root)
905 .map_err(|e| NucleusError::PivotRootError(format!("chroot syscall failed: {}", e)))?;
906
907 std::env::set_current_dir("/")
909 .map_err(|e| NucleusError::PivotRootError(format!("Failed to chdir to /: {}", e)))?;
910
911 if let Err(e) = caps::drop(
914 None,
915 caps::CapSet::Bounding,
916 caps::Capability::CAP_SYS_CHROOT,
917 ) {
918 debug!(
919 "Could not drop CAP_SYS_CHROOT after chroot: {} (may not be present)",
920 e
921 );
922 }
923 if let Err(e) = caps::drop(
924 None,
925 caps::CapSet::Effective,
926 caps::Capability::CAP_SYS_CHROOT,
927 ) {
928 debug!(
929 "Could not drop effective CAP_SYS_CHROOT: {} (may not be present)",
930 e
931 );
932 }
933 if let Err(e) = caps::drop(
934 None,
935 caps::CapSet::Permitted,
936 caps::Capability::CAP_SYS_CHROOT,
937 ) {
938 debug!(
939 "Could not drop permitted CAP_SYS_CHROOT: {} (may not be present)",
940 e
941 );
942 }
943
944 info!("Successfully switched root using chroot (CAP_SYS_CHROOT dropped)");
945
946 Ok(())
947}
948
949pub fn mount_secrets(root: &Path, secrets: &[crate::container::SecretMount]) -> Result<()> {
954 if secrets.is_empty() {
955 return Ok(());
956 }
957
958 info!("Mounting {} secret(s) into container", secrets.len());
959
960 for secret in secrets {
961 let source_fd = open(
962 &secret.source,
963 OFlag::O_PATH | OFlag::O_NOFOLLOW | OFlag::O_CLOEXEC,
964 Mode::empty(),
965 )
966 .map_err(|e| {
967 NucleusError::FilesystemError(format!(
968 "Failed to open secret source {:?} with O_NOFOLLOW: {}",
969 secret.source, e
970 ))
971 })?;
972 let source_stat = fstat(&source_fd).map_err(|e| {
973 NucleusError::FilesystemError(format!(
974 "Failed to stat secret source {:?}: {}",
975 secret.source, e
976 ))
977 })?;
978 let source_kind = SFlag::from_bits_truncate(source_stat.st_mode);
979 let source_is_file = source_kind == SFlag::S_IFREG;
980 let source_is_dir = source_kind == SFlag::S_IFDIR;
981 if !source_is_file && !source_is_dir {
982 return Err(NucleusError::FilesystemError(format!(
983 "Secret source {:?} must be a regular file or directory",
984 secret.source
985 )));
986 }
987 let source_fd_path = PathBuf::from(format!("/proc/self/fd/{}", source_fd.as_raw_fd()));
988
989 let dest = resolve_container_destination(root, &secret.dest)?;
991
992 if let Some(parent) = dest.parent() {
994 std::fs::create_dir_all(parent).map_err(|e| {
995 NucleusError::FilesystemError(format!(
996 "Failed to create secret mount parent {:?}: {}",
997 parent, e
998 ))
999 })?;
1000 }
1001
1002 if source_is_file {
1004 std::fs::write(&dest, "").map_err(|e| {
1005 NucleusError::FilesystemError(format!(
1006 "Failed to create secret mount point {:?}: {}",
1007 dest, e
1008 ))
1009 })?;
1010 } else {
1011 std::fs::create_dir_all(&dest).map_err(|e| {
1012 NucleusError::FilesystemError(format!(
1013 "Failed to create secret mount dir {:?}: {}",
1014 dest, e
1015 ))
1016 })?;
1017 }
1018
1019 mount(
1021 Some(source_fd_path.as_path()),
1022 &dest,
1023 None::<&str>,
1024 MsFlags::MS_BIND,
1025 None::<&str>,
1026 )
1027 .map_err(|e| {
1028 NucleusError::FilesystemError(format!(
1029 "Failed to bind mount secret {:?}: {}",
1030 secret.source, e
1031 ))
1032 })?;
1033
1034 mount(
1035 None::<&str>,
1036 &dest,
1037 None::<&str>,
1038 MsFlags::MS_REMOUNT
1039 | MsFlags::MS_BIND
1040 | MsFlags::MS_RDONLY
1041 | MsFlags::MS_NOSUID
1042 | MsFlags::MS_NODEV
1043 | MsFlags::MS_NOEXEC,
1044 None::<&str>,
1045 )
1046 .map_err(|e| {
1047 NucleusError::FilesystemError(format!(
1048 "Failed to remount secret {:?} read-only: {}",
1049 dest, e
1050 ))
1051 })?;
1052
1053 if source_is_file {
1055 use std::os::unix::fs::PermissionsExt;
1056 let perms = std::fs::Permissions::from_mode(secret.mode);
1057 if let Err(e) = std::fs::set_permissions(&dest, perms) {
1058 warn!(
1059 "Failed to set mode {:04o} on secret {:?}: {} (bind mount may override)",
1060 secret.mode, dest, e
1061 );
1062 }
1063 }
1064
1065 debug!(
1066 "Mounted secret {:?} -> {:?} (mode {:04o})",
1067 secret.source, secret.dest, secret.mode
1068 );
1069 }
1070
1071 Ok(())
1072}
1073
1074pub fn mount_secrets_inmemory(
1080 root: &Path,
1081 secrets: &[crate::container::SecretMount],
1082 identity: &crate::container::ProcessIdentity,
1083) -> Result<()> {
1084 if secrets.is_empty() {
1085 return Ok(());
1086 }
1087
1088 info!("Mounting {} secret(s) on in-memory tmpfs", secrets.len());
1089
1090 let secrets_dir = root.join("run/secrets");
1091 std::fs::create_dir_all(&secrets_dir).map_err(|e| {
1092 NucleusError::FilesystemError(format!(
1093 "Failed to create secrets dir {:?}: {}",
1094 secrets_dir, e
1095 ))
1096 })?;
1097
1098 if let Err(e) = mount(
1100 Some("tmpfs"),
1101 &secrets_dir,
1102 Some("tmpfs"),
1103 MsFlags::MS_NOSUID | MsFlags::MS_NODEV | MsFlags::MS_NOEXEC,
1104 Some("size=16m,mode=0700"),
1105 ) {
1106 let _ = std::fs::remove_dir_all(&secrets_dir);
1107 return Err(NucleusError::FilesystemError(format!(
1108 "Failed to mount secrets tmpfs at {:?}: {}",
1109 secrets_dir, e
1110 )));
1111 }
1112
1113 if !identity.is_root() {
1114 nix::unistd::chown(
1115 &secrets_dir,
1116 Some(nix::unistd::Uid::from_raw(identity.uid)),
1117 Some(nix::unistd::Gid::from_raw(identity.gid)),
1118 )
1119 .map_err(|e| {
1120 let _ = nix::mount::umount2(&secrets_dir, nix::mount::MntFlags::MNT_DETACH);
1121 let _ = std::fs::remove_dir_all(&secrets_dir);
1122 NucleusError::FilesystemError(format!(
1123 "Failed to set /run/secrets owner to {}:{}: {}",
1124 identity.uid, identity.gid, e
1125 ))
1126 })?;
1127 }
1128
1129 let result = mount_secrets_inmemory_inner(&secrets_dir, root, secrets, identity);
1131 if let Err(ref e) = result {
1132 let _ = nix::mount::umount2(&secrets_dir, nix::mount::MntFlags::MNT_DETACH);
1133 let _ = std::fs::remove_dir_all(&secrets_dir);
1134 return Err(NucleusError::FilesystemError(format!(
1135 "Secret mount failed (rolled back): {}",
1136 e
1137 )));
1138 }
1139
1140 info!("All secrets mounted on in-memory tmpfs");
1141 Ok(())
1142}
1143
1144fn mount_secrets_inmemory_inner(
1145 secrets_dir: &Path,
1146 root: &Path,
1147 secrets: &[crate::container::SecretMount],
1148 identity: &crate::container::ProcessIdentity,
1149) -> Result<()> {
1150 for secret in secrets {
1151 let mut content = read_regular_file_nofollow(&secret.source)?;
1152
1153 let dest = resolve_container_destination(secrets_dir, &secret.dest)?;
1155
1156 if let Some(parent) = dest.parent() {
1158 std::fs::create_dir_all(parent).map_err(|e| {
1159 NucleusError::FilesystemError(format!(
1160 "Failed to create secret parent dir {:?}: {}",
1161 parent, e
1162 ))
1163 })?;
1164 }
1165
1166 std::fs::write(&dest, &content).map_err(|e| {
1168 NucleusError::FilesystemError(format!("Failed to write secret to {:?}: {}", dest, e))
1169 })?;
1170
1171 {
1173 use std::os::unix::fs::PermissionsExt;
1174 let perms = std::fs::Permissions::from_mode(secret.mode);
1175 std::fs::set_permissions(&dest, perms).map_err(|e| {
1176 NucleusError::FilesystemError(format!(
1177 "Failed to set permissions on secret {:?}: {}",
1178 dest, e
1179 ))
1180 })?;
1181 }
1182
1183 if !identity.is_root() {
1184 nix::unistd::chown(
1185 &dest,
1186 Some(nix::unistd::Uid::from_raw(identity.uid)),
1187 Some(nix::unistd::Gid::from_raw(identity.gid)),
1188 )
1189 .map_err(|e| {
1190 NucleusError::FilesystemError(format!(
1191 "Failed to set permissions owner on secret {:?} to {}:{}: {}",
1192 dest, identity.uid, identity.gid, e
1193 ))
1194 })?;
1195 }
1196
1197 zeroize::Zeroize::zeroize(&mut content);
1199 drop(content);
1200
1201 let container_dest = resolve_container_destination(root, &secret.dest)?;
1203 if container_dest != dest {
1204 if let Some(parent) = container_dest.parent() {
1205 std::fs::create_dir_all(parent).map_err(|e| {
1206 NucleusError::FilesystemError(format!(
1207 "Failed to create secret mount parent {:?}: {}",
1208 parent, e
1209 ))
1210 })?;
1211 }
1212
1213 std::fs::write(&container_dest, "").map_err(|e| {
1214 NucleusError::FilesystemError(format!(
1215 "Failed to create secret mount point {:?}: {}",
1216 container_dest, e
1217 ))
1218 })?;
1219
1220 mount(
1221 Some(dest.as_path()),
1222 &container_dest,
1223 None::<&str>,
1224 MsFlags::MS_BIND,
1225 None::<&str>,
1226 )
1227 .map_err(|e| {
1228 NucleusError::FilesystemError(format!(
1229 "Failed to bind mount secret {:?} -> {:?}: {}",
1230 dest, container_dest, e
1231 ))
1232 })?;
1233
1234 mount(
1235 None::<&str>,
1236 &container_dest,
1237 None::<&str>,
1238 MsFlags::MS_REMOUNT
1239 | MsFlags::MS_BIND
1240 | MsFlags::MS_RDONLY
1241 | MsFlags::MS_NOSUID
1242 | MsFlags::MS_NODEV
1243 | MsFlags::MS_NOEXEC,
1244 None::<&str>,
1245 )
1246 .map_err(|e| {
1247 NucleusError::FilesystemError(format!(
1248 "Failed to remount secret {:?} read-only: {}",
1249 container_dest, e
1250 ))
1251 })?;
1252 }
1253
1254 debug!(
1255 "Secret {:?} -> {:?} (in-memory tmpfs, mode {:04o})",
1256 secret.source, secret.dest, secret.mode
1257 );
1258 }
1259
1260 Ok(())
1261}
1262
#[cfg(test)]
mod tests {
    use super::*;
    use std::os::unix::fs::symlink;

    /// `sysrq-trigger` must be in the null-masked list.
    #[test]
    fn test_proc_mask_includes_sysrq_trigger() {
        let masked = PROC_NULL_MASKED.iter().any(|&entry| entry == "sysrq-trigger");
        assert!(
            masked,
            "/proc/sysrq-trigger must be masked to prevent host DoS"
        );
    }

    /// `timer_stats` must be in the null-masked list.
    #[test]
    fn test_proc_mask_includes_timer_stats() {
        let masked = PROC_NULL_MASKED.iter().any(|&entry| entry == "timer_stats");
        assert!(
            masked,
            "/proc/timer_stats must be masked to prevent kernel info leakage"
        );
    }

    /// All three `kpage*` files must be in the null-masked list.
    #[test]
    fn test_proc_mask_includes_kpage_files() {
        for path in ["kpagecount", "kpageflags", "kpagecgroup"] {
            assert!(
                PROC_NULL_MASKED.contains(&path),
                "/proc/{} must be masked to prevent host memory layout leakage",
                path
            );
        }
    }

    /// Both mask lists must cover the expected standard paths.
    #[test]
    fn test_proc_mask_includes_oci_standard_paths() {
        for path in ["kallsyms", "kcore", "sched_debug", "keys", "config.gz"] {
            assert!(
                PROC_NULL_MASKED.contains(&path),
                "/proc/{} must be in null-masked list (OCI spec)",
                path
            );
        }
        for path in ["acpi", "bus", "scsi", "sys"] {
            assert!(
                PROC_TMPFS_MASKED.contains(&path),
                "/proc/{} must be in tmpfs-masked list (OCI spec)",
                path
            );
        }
    }

    /// A plain regular file is read back byte for byte.
    #[test]
    fn test_read_regular_file_nofollow_reads_regular_file() {
        let scratch = tempfile::tempdir().unwrap();
        let secret_path = scratch.path().join("secret.txt");
        std::fs::write(&secret_path, "supersecret").unwrap();

        let bytes = read_regular_file_nofollow(&secret_path).unwrap();
        assert_eq!(bytes, b"supersecret");
    }

    /// A symlink to a regular file is refused at open time.
    #[test]
    fn test_read_regular_file_nofollow_rejects_symlink() {
        let scratch = tempfile::tempdir().unwrap();
        let real_file = scratch.path().join("target.txt");
        let link_path = scratch.path().join("secret-link");
        std::fs::write(&real_file, "supersecret").unwrap();
        symlink(&real_file, &link_path).unwrap();

        let message = read_regular_file_nofollow(&link_path)
            .unwrap_err()
            .to_string();
        assert!(
            message.contains("O_NOFOLLOW"),
            "symlink reads must fail via O_NOFOLLOW"
        );
    }
}