1use crate::error::{NucleusError, Result};
2use nix::fcntl::{open, OFlag};
3use nix::mount::{mount, MsFlags};
4use nix::sys::stat::{fstat, makedev, mknod, Mode, SFlag};
5use nix::unistd::chroot;
6use std::fs::OpenOptions;
7use std::io::Read;
8use std::os::fd::AsRawFd;
9use std::os::unix::fs::OpenOptionsExt;
10use std::path::{Component, Path, PathBuf};
11use tracing::{debug, info, warn};
12
/// One row of the mount audit table: a mount point and the options it must carry.
#[derive(Debug, Clone, Copy)]
struct ExpectedMount {
    /// Absolute mount point that is looked up in the parsed mount table.
    path: &'static str,
    /// Mount options that must all be present on `path` when it is mounted.
    required_flags: &'static [&'static str],
    /// When `true`, the mount being absent is reported as a violation in production mode.
    critical: bool,
}
21
22const PRODUCTION_MOUNT_EXPECTATIONS: &[ExpectedMount] = &[
24 ExpectedMount {
25 path: "/bin",
26 required_flags: &["ro", "nosuid", "nodev"],
27 critical: true,
28 },
29 ExpectedMount {
30 path: "/usr",
31 required_flags: &["ro", "nosuid", "nodev"],
32 critical: true,
33 },
34 ExpectedMount {
35 path: "/lib",
36 required_flags: &["ro", "nosuid", "nodev"],
37 critical: false, },
39 ExpectedMount {
40 path: "/lib64",
41 required_flags: &["ro", "nosuid", "nodev"],
42 critical: false, },
44 ExpectedMount {
45 path: "/etc",
46 required_flags: &["ro", "nosuid", "nodev"],
47 critical: true,
48 },
49 ExpectedMount {
50 path: "/nix",
51 required_flags: &["ro", "nosuid", "nodev"],
52 critical: false, },
54 ExpectedMount {
55 path: "/sbin",
56 required_flags: &["ro", "nosuid", "nodev"],
57 critical: false, },
59 ExpectedMount {
60 path: "/proc",
61 required_flags: &["nosuid", "nodev", "noexec"],
62 critical: true,
63 },
64 ExpectedMount {
65 path: "/run/secrets",
66 required_flags: &["nosuid", "nodev", "noexec"],
67 critical: false, },
69];
70
71pub fn normalize_container_destination(dest: &Path) -> Result<PathBuf> {
76 if !dest.is_absolute() {
77 return Err(NucleusError::ConfigError(format!(
78 "Container destination must be absolute: {:?}",
79 dest
80 )));
81 }
82
83 let mut normalized = PathBuf::from("/");
84 let mut saw_component = false;
85
86 for component in dest.components() {
87 match component {
88 Component::RootDir => {}
89 Component::CurDir => {}
90 Component::Normal(part) => {
91 normalized.push(part);
92 saw_component = true;
93 }
94 Component::ParentDir => {
95 return Err(NucleusError::ConfigError(format!(
96 "Container destination must not contain parent traversal: {:?}",
97 dest
98 )));
99 }
100 Component::Prefix(_) => {
101 return Err(NucleusError::ConfigError(format!(
102 "Unsupported container destination prefix: {:?}",
103 dest
104 )));
105 }
106 }
107 }
108
109 if !saw_component {
110 return Err(NucleusError::ConfigError(format!(
111 "Container destination must not be the root directory: {:?}",
112 dest
113 )));
114 }
115
116 Ok(normalized)
117}
118
119pub fn resolve_container_destination(root: &Path, dest: &Path) -> Result<PathBuf> {
121 let normalized = normalize_container_destination(dest)?;
122 let relative = normalized.strip_prefix("/").map_err(|_| {
123 NucleusError::ConfigError(format!(
124 "Container destination is not absolute after normalization: {:?}",
125 normalized
126 ))
127 })?;
128 Ok(root.join(relative))
129}
130
131pub(crate) fn read_regular_file_nofollow(path: &Path) -> Result<Vec<u8>> {
132 let mut file = OpenOptions::new()
133 .read(true)
134 .custom_flags(libc::O_NOFOLLOW | libc::O_CLOEXEC)
135 .open(path)
136 .map_err(|e| {
137 NucleusError::FilesystemError(format!(
138 "Failed to open file {:?} with O_NOFOLLOW: {}",
139 path, e
140 ))
141 })?;
142
143 let metadata = file.metadata().map_err(|e| {
144 NucleusError::FilesystemError(format!("Failed to stat file {:?}: {}", path, e))
145 })?;
146 if !metadata.is_file() {
147 return Err(NucleusError::FilesystemError(format!(
148 "Expected regular file for {:?}, found non-file source",
149 path
150 )));
151 }
152
153 let mut content = Vec::new();
154 file.read_to_end(&mut content).map_err(|e| {
155 NucleusError::FilesystemError(format!("Failed to read file {:?}: {}", path, e))
156 })?;
157 Ok(content)
158}
159
/// Decodes the octal escapes used in a `/proc/self/mountinfo` field.
///
/// Recognized escapes are `\040` (space), `\011` (tab), `\012` (newline) and
/// `\134` (backslash). A backslash followed by anything else is copied through
/// unchanged together with the (up to three) characters that follow it.
fn decode_mountinfo_field(field: &str) -> String {
    let mut out = String::with_capacity(field.len());
    let mut remaining = field;

    while let Some(slash_at) = remaining.find('\\') {
        // Everything before the backslash is literal text.
        out.push_str(&remaining[..slash_at]);
        let after_slash = &remaining[slash_at + 1..];

        // The escape body is the next three characters, or fewer at end of input.
        let code_len = after_slash
            .char_indices()
            .nth(3)
            .map_or(after_slash.len(), |(idx, _)| idx);
        let (code, tail) = after_slash.split_at(code_len);

        match code {
            "040" => out.push(' '),
            "011" => out.push('\t'),
            "012" => out.push('\n'),
            "134" => out.push('\\'),
            unknown => {
                out.push('\\');
                out.push_str(unknown);
            }
        }

        remaining = tail;
    }

    out.push_str(remaining);
    out
}
184
185fn parse_mountinfo_line(line: &str) -> Option<(String, std::collections::HashSet<String>)> {
186 let (left, _) = line.split_once(" - ")?;
187 let fields: Vec<&str> = left.split_whitespace().collect();
188 if fields.len() < 6 {
189 return None;
190 }
191
192 let mount_point = decode_mountinfo_field(fields[4]);
193 let options = fields[5]
194 .split(',')
195 .map(str::trim)
196 .filter(|opt| !opt.is_empty())
197 .map(str::to_string)
198 .collect();
199
200 Some((mount_point, options))
201}
202
203pub fn audit_mounts(production_mode: bool) -> Result<()> {
209 let mounts_content = std::fs::read_to_string("/proc/self/mountinfo").map_err(|e| {
210 NucleusError::FilesystemError(format!("Failed to read /proc/self/mountinfo: {}", e))
211 })?;
212 let mount_table: std::collections::HashMap<String, std::collections::HashSet<String>> =
213 mounts_content
214 .lines()
215 .filter_map(parse_mountinfo_line)
216 .collect();
217
218 let mut violations = Vec::new();
219
220 for expectation in PRODUCTION_MOUNT_EXPECTATIONS {
221 if let Some(options) = mount_table.get(expectation.path) {
222 for &flag in expectation.required_flags {
223 if !options.contains(flag) {
224 let rendered = options
225 .iter()
226 .map(String::as_str)
227 .collect::<Vec<_>>()
228 .join(",");
229 violations.push(format!(
230 "Mount {} missing required flag '{}' (has: {})",
231 expectation.path, flag, rendered
232 ));
233 }
234 }
235 } else if expectation.critical && production_mode {
236 violations.push(format!(
237 "Critical mount {} is missing from the mount namespace",
238 expectation.path
239 ));
240 }
241 }
242
243 if violations.is_empty() {
244 info!("Mount audit passed: all expected flags verified");
245 Ok(())
246 } else if production_mode {
247 Err(NucleusError::FilesystemError(format!(
248 "Mount audit failed in production mode:\n {}",
249 violations.join("\n ")
250 )))
251 } else {
252 for v in &violations {
253 warn!("Mount audit: {}", v);
254 }
255 Ok(())
256 }
257}
258
259pub fn create_minimal_fs(root: &Path) -> Result<()> {
261 info!("Creating minimal filesystem structure at {:?}", root);
262
263 let dirs = vec![
265 "dev",
266 "proc",
267 "sys",
268 "tmp",
269 "bin",
270 "sbin",
271 "usr",
272 "lib",
273 "lib64",
274 "etc",
275 "nix",
276 "nix/store",
277 "run",
278 "context",
279 ];
280
281 for dir in dirs {
282 let path = root.join(dir);
283 std::fs::create_dir_all(&path).map_err(|e| {
284 NucleusError::FilesystemError(format!("Failed to create directory {:?}: {}", path, e))
285 })?;
286 }
287
288 info!("Created minimal filesystem structure");
289
290 Ok(())
291}
292
293pub fn create_dev_nodes(dev_path: &Path, include_tty: bool) -> Result<()> {
297 info!("Creating device nodes at {:?}", dev_path);
298
299 let mut devices = vec![
301 ("null", SFlag::S_IFCHR, 1, 3),
302 ("zero", SFlag::S_IFCHR, 1, 5),
303 ("full", SFlag::S_IFCHR, 1, 7),
304 ("random", SFlag::S_IFCHR, 1, 8),
305 ("urandom", SFlag::S_IFCHR, 1, 9),
306 ];
307 if include_tty {
308 devices.push(("tty", SFlag::S_IFCHR, 5, 0));
309 }
310
311 let mut created_count = 0;
312 let mut failed_count = 0;
313
314 for (name, dev_type, major, minor) in devices {
315 let path = dev_path.join(name);
316 let mode = Mode::from_bits_truncate(0o660);
317 let dev = makedev(major, minor);
318
319 match mknod(&path, dev_type, mode, dev) {
320 Ok(_) => {
321 info!("Created device node: {:?}", path);
322 created_count += 1;
323 }
324 Err(e) => {
325 warn!(
327 "Failed to create device node {:?}: {} (this is normal in rootless mode)",
328 path, e
329 );
330 failed_count += 1;
331 }
332 }
333 }
334
335 if created_count > 0 {
336 info!("Successfully created {} device nodes", created_count);
337 }
338 if failed_count > 0 {
339 info!("Skipped {} device nodes (rootless mode)", failed_count);
340 }
341
342 Ok(())
343}
344
345pub fn bind_mount_rootfs(root: &Path, rootfs_path: &Path) -> Result<()> {
350 info!(
351 "Bind mounting production rootfs {:?} into container {:?}",
352 rootfs_path, root
353 );
354
355 if std::fs::symlink_metadata(rootfs_path).is_err() {
356 return Err(NucleusError::FilesystemError(format!(
357 "Rootfs path does not exist: {:?}",
358 rootfs_path
359 )));
360 }
361
362 let subdirs = ["bin", "sbin", "lib", "lib64", "usr", "etc", "nix"];
366
367 for subdir in &subdirs {
368 let source = rootfs_path.join(subdir);
369 if !source.exists() {
370 debug!("Rootfs subdir {} not present, skipping", subdir);
371 continue;
372 }
373
374 let target = root.join(subdir);
375 std::fs::create_dir_all(&target).map_err(|e| {
376 NucleusError::FilesystemError(format!(
377 "Failed to create mount point {:?}: {}",
378 target, e
379 ))
380 })?;
381
382 mount(
383 Some(&source),
384 &target,
385 None::<&str>,
386 MsFlags::MS_BIND | MsFlags::MS_REC,
387 None::<&str>,
388 )
389 .map_err(|e| {
390 NucleusError::FilesystemError(format!(
391 "Failed to bind mount rootfs {:?} -> {:?}: {}",
392 source, target, e
393 ))
394 })?;
395
396 mount(
398 None::<&str>,
399 &target,
400 None::<&str>,
401 MsFlags::MS_REMOUNT
402 | MsFlags::MS_BIND
403 | MsFlags::MS_RDONLY
404 | MsFlags::MS_REC
405 | MsFlags::MS_NOSUID
406 | MsFlags::MS_NODEV,
407 None::<&str>,
408 )
409 .map_err(|e| {
410 NucleusError::FilesystemError(format!(
411 "Failed to remount rootfs {:?} read-only: {}",
412 target, e
413 ))
414 })?;
415
416 info!("Mounted rootfs/{} read-only", subdir);
417 }
418
419 Ok(())
420}
421
422pub fn bind_mount_host_paths(root: &Path, best_effort: bool) -> Result<()> {
427 info!("Bind mounting host paths into container");
428
429 let host_paths = vec![
431 "/bin", "/usr", "/lib", "/lib64", "/nix", ];
433
434 for host_path in host_paths {
435 let host = Path::new(host_path);
436
437 if !host.exists() {
439 debug!("Skipping {} (not present on host)", host_path);
440 continue;
441 }
442
443 let container_path = root.join(host_path.trim_start_matches('/'));
444
445 if let Err(e) = std::fs::create_dir_all(&container_path) {
447 if best_effort {
448 warn!("Failed to create mount point {:?}: {}", container_path, e);
449 continue;
450 }
451 return Err(NucleusError::FilesystemError(format!(
452 "Failed to create mount point {:?}: {}",
453 container_path, e
454 )));
455 }
456
457 match mount(
461 Some(host),
462 &container_path,
463 None::<&str>,
464 MsFlags::MS_BIND | MsFlags::MS_REC,
465 None::<&str>,
466 ) {
467 Ok(_) => {
468 mount(
470 None::<&str>,
471 &container_path,
472 None::<&str>,
473 MsFlags::MS_REMOUNT
474 | MsFlags::MS_BIND
475 | MsFlags::MS_RDONLY
476 | MsFlags::MS_REC
477 | MsFlags::MS_NOSUID
478 | MsFlags::MS_NODEV,
479 None::<&str>,
480 )
481 .map_err(|e| {
482 NucleusError::FilesystemError(format!(
483 "Failed to remount {} as read-only: {}",
484 host_path, e
485 ))
486 })?;
487 info!(
488 "Bind mounted {} to {:?} (read-only)",
489 host_path, container_path
490 );
491 }
492 Err(e) => {
493 if best_effort {
494 warn!(
495 "Failed to bind mount {}: {} (continuing anyway)",
496 host_path, e
497 );
498 } else {
499 return Err(NucleusError::FilesystemError(format!(
500 "Failed to bind mount {}: {}",
501 host_path, e
502 )));
503 }
504 }
505 }
506 }
507
508 Ok(())
509}
510
/// Host paths that are rejected as bind mount sources when matched exactly.
const DENIED_BIND_MOUNT_SOURCES_EXACT: &[&str] =
    &["/", "/etc/shadow", "/etc/sudoers", "/etc/passwd", "/etc/gshadow"];
519
/// Host directories that are rejected as bind mount sources, together with
/// everything beneath them.
const DENIED_BIND_MOUNT_SOURCE_PREFIXES: &[&str] = &[
    "/proc",
    "/sys",
    "/dev",
    "/boot",
];
522
523fn normalize_bind_mount_source_for_policy(source: &Path) -> Result<PathBuf> {
524 if !source.is_absolute() {
525 return Err(NucleusError::ConfigError(format!(
526 "Bind mount source must be absolute: {:?}",
527 source
528 )));
529 }
530
531 let mut normalized = PathBuf::from("/");
532
533 for component in source.components() {
534 match component {
535 Component::RootDir => {}
536 Component::CurDir => {}
537 Component::Normal(part) => normalized.push(part),
538 Component::ParentDir => {
539 normalized.pop();
540 if normalized.as_os_str().is_empty() {
541 normalized.push("/");
542 }
543 }
544 Component::Prefix(_) => {
545 return Err(NucleusError::ConfigError(format!(
546 "Unsupported bind mount source prefix: {:?}",
547 source
548 )));
549 }
550 }
551 }
552
553 Ok(normalized)
554}
555
556fn reject_denied_bind_mount_source(source: &Path) -> Result<()> {
557 for denied in DENIED_BIND_MOUNT_SOURCES_EXACT {
558 if source == Path::new(denied) {
559 return Err(NucleusError::ConfigError(format!(
560 "Bind mount source '{}' is a sensitive host path and cannot be mounted into containers",
561 source.display()
562 )));
563 }
564 }
565
566 for denied in DENIED_BIND_MOUNT_SOURCE_PREFIXES {
567 let denied_path = Path::new(denied);
568 if source == denied_path || source.starts_with(denied_path) {
569 return Err(NucleusError::ConfigError(format!(
570 "Bind mount source '{}' is under sensitive host path '{}' and cannot be mounted into containers",
571 source.display(),
572 denied
573 )));
574 }
575 }
576
577 Ok(())
578}
579
580pub fn validate_bind_mount_source(source: &Path) -> Result<()> {
582 let normalized = normalize_bind_mount_source_for_policy(source)?;
583 reject_denied_bind_mount_source(&normalized)?;
584
585 let canonical = std::fs::canonicalize(source).map_err(|e| {
586 NucleusError::ConfigError(format!(
587 "Failed to resolve bind mount source {:?}: {}",
588 source, e
589 ))
590 })?;
591 reject_denied_bind_mount_source(&canonical)
592}
593
594pub fn mount_volumes(root: &Path, volumes: &[crate::container::VolumeMount]) -> Result<()> {
596 use crate::container::VolumeSource;
597
598 if volumes.is_empty() {
599 return Ok(());
600 }
601
602 info!("Mounting {} volume(s) into container", volumes.len());
603
604 for volume in volumes {
605 let dest = resolve_container_destination(root, &volume.dest)?;
606
607 match &volume.source {
608 VolumeSource::Bind { source } => {
609 validate_bind_mount_source(source)?;
611
612 if std::fs::symlink_metadata(source).is_err() {
615 return Err(NucleusError::FilesystemError(format!(
616 "Volume source does not exist: {:?}",
617 source
618 )));
619 }
620
621 if let Some(parent) = dest.parent() {
622 std::fs::create_dir_all(parent).map_err(|e| {
623 NucleusError::FilesystemError(format!(
624 "Failed to create volume mount parent {:?}: {}",
625 parent, e
626 ))
627 })?;
628 }
629
630 let recursive = source.is_dir();
631 if source.is_file() {
632 std::fs::write(&dest, "").map_err(|e| {
633 NucleusError::FilesystemError(format!(
634 "Failed to create volume mount point {:?}: {}",
635 dest, e
636 ))
637 })?;
638 } else {
639 std::fs::create_dir_all(&dest).map_err(|e| {
640 NucleusError::FilesystemError(format!(
641 "Failed to create volume mount dir {:?}: {}",
642 dest, e
643 ))
644 })?;
645 }
646
647 let initial_flags = if recursive {
648 MsFlags::MS_BIND | MsFlags::MS_REC
649 } else {
650 MsFlags::MS_BIND
651 };
652 mount(
653 Some(source.as_path()),
654 &dest,
655 None::<&str>,
656 initial_flags,
657 None::<&str>,
658 )
659 .map_err(|e| {
660 NucleusError::FilesystemError(format!(
661 "Failed to bind mount volume {:?} -> {:?}: {}",
662 source, dest, e
663 ))
664 })?;
665
666 let mut remount_flags =
667 MsFlags::MS_REMOUNT | MsFlags::MS_BIND | MsFlags::MS_NOSUID | MsFlags::MS_NODEV;
668 if recursive {
669 remount_flags |= MsFlags::MS_REC;
670 }
671 if volume.read_only {
672 remount_flags |= MsFlags::MS_RDONLY;
673 }
674
675 mount(
676 None::<&str>,
677 &dest,
678 None::<&str>,
679 remount_flags,
680 None::<&str>,
681 )
682 .map_err(|e| {
683 NucleusError::FilesystemError(format!(
684 "Failed to remount volume {:?} with final flags: {}",
685 dest, e
686 ))
687 })?;
688
689 info!(
690 "Mounted bind volume {:?} -> {:?} ({})",
691 source,
692 volume.dest,
693 if volume.read_only { "ro" } else { "rw" }
694 );
695 }
696 VolumeSource::Tmpfs { size } => {
697 std::fs::create_dir_all(&dest).map_err(|e| {
698 NucleusError::FilesystemError(format!(
699 "Failed to create tmpfs mount dir {:?}: {}",
700 dest, e
701 ))
702 })?;
703
704 if let Some(value) = size.as_ref() {
707 let valid = value
708 .chars()
709 .all(|c| c.is_ascii_digit() || "kKmMgG".contains(c));
710 if !valid || value.is_empty() {
711 return Err(NucleusError::FilesystemError(format!(
712 "Invalid tmpfs size value '{}': only digits with optional K/M/G suffix allowed",
713 value
714 )));
715 }
716 }
717
718 let mount_data = size
721 .as_ref()
722 .map(|value| format!("size={},mode=0700", value))
723 .unwrap_or_else(|| "size=64M,mode=0700".to_string());
724
725 let mut flags = MsFlags::MS_NOSUID | MsFlags::MS_NODEV;
726 if volume.read_only {
727 flags |= MsFlags::MS_RDONLY;
728 }
729 mount(
730 Some("tmpfs"),
731 &dest,
732 Some("tmpfs"),
733 flags,
734 Some(mount_data.as_str()),
735 )
736 .map_err(|e| {
737 NucleusError::FilesystemError(format!(
738 "Failed to mount tmpfs volume at {:?}: {}",
739 dest, e
740 ))
741 })?;
742
743 info!(
744 "Mounted tmpfs volume at {:?}{}{}",
745 volume.dest,
746 size.as_ref()
747 .map(|value| format!(" (size={})", value))
748 .unwrap_or_default(),
749 if volume.read_only { " (ro)" } else { "" }
750 );
751 }
752 }
753 }
754
755 Ok(())
756}
757
758pub fn mount_procfs(
764 proc_path: &Path,
765 best_effort: bool,
766 read_only: bool,
767 hide_pids: bool,
768) -> Result<()> {
769 info!(
770 "Mounting procfs at {:?} (hidepid={})",
771 proc_path,
772 if hide_pids { "2" } else { "0" }
773 );
774
775 let mount_data: Option<&str> = if hide_pids { Some("hidepid=2") } else { None };
776 let mut used_hidepid = hide_pids;
777
778 let mounted = match mount(
779 Some("proc"),
780 proc_path,
781 Some("proc"),
782 MsFlags::MS_NOSUID | MsFlags::MS_NODEV | MsFlags::MS_NOEXEC,
783 mount_data,
784 ) {
785 Ok(_) => true,
786 Err(e) if hide_pids && best_effort => {
787 warn!(
790 "Failed to mount procfs with hidepid=2: {} (retrying without hidepid)",
791 e
792 );
793 match mount(
794 Some("proc"),
795 proc_path,
796 Some("proc"),
797 MsFlags::MS_NOSUID | MsFlags::MS_NODEV | MsFlags::MS_NOEXEC,
798 None::<&str>,
799 ) {
800 Ok(_) => {
801 used_hidepid = false;
802 true
803 }
804 Err(e) => {
805 warn!("Failed to mount procfs: {} (continuing anyway)", e);
806 false
807 }
808 }
809 }
810 Err(e) => {
811 if best_effort {
812 warn!("Failed to mount procfs: {} (continuing anyway)", e);
813 false
814 } else {
815 return Err(NucleusError::FilesystemError(format!(
816 "Failed to mount procfs: {}",
817 e
818 )));
819 }
820 }
821 };
822
823 if mounted {
824 if read_only {
825 mount(
826 None::<&str>,
827 proc_path,
828 None::<&str>,
829 MsFlags::MS_REMOUNT
830 | MsFlags::MS_RDONLY
831 | MsFlags::MS_NOSUID
832 | MsFlags::MS_NODEV
833 | MsFlags::MS_NOEXEC,
834 None::<&str>,
835 )
836 .map_err(|e| {
837 NucleusError::FilesystemError(format!("Failed to remount procfs read-only: {}", e))
838 })?;
839 if hide_pids && !used_hidepid {
840 info!("Successfully mounted procfs without hidepid (read-only)");
841 } else {
842 info!("Successfully mounted procfs (read-only)");
843 }
844 } else if hide_pids && !used_hidepid {
845 info!("Successfully mounted procfs without hidepid");
846 } else {
847 info!("Successfully mounted procfs");
848 }
849 }
850
851 Ok(())
852}
853
/// Entries under the procfs mount that are masked by bind-mounting `/dev/null`
/// over them.
pub const PROC_NULL_MASKED: &[&str] = &[
    "kallsyms", "kcore", "sched_debug", "timer_list", "timer_stats", "keys",
    "latency_stats", "config.gz", "sysrq-trigger", "kpagecount", "kpageflags",
    "kpagecgroup",
];
871
/// Entries under the procfs mount that are remounted read-only.
pub const PROC_READONLY_PATHS: &[&str] = &[
    "bus",
    "fs",
    "irq",
    "sys",
];
874
/// Entries under the procfs mount that are masked by mounting a read-only
/// tmpfs over them.
pub const PROC_TMPFS_MASKED: &[&str] = &[
    "acpi",
    "scsi",
];
877
878fn remount_proc_path_readonly(target: &Path) -> Result<()> {
879 mount(
880 Some(target),
881 target,
882 None::<&str>,
883 MsFlags::MS_BIND | MsFlags::MS_REC,
884 None::<&str>,
885 )
886 .map_err(|e| {
887 NucleusError::FilesystemError(format!(
888 "Failed to bind-mount {:?} onto itself for read-only remount: {}",
889 target, e
890 ))
891 })?;
892
893 mount(
894 None::<&str>,
895 target,
896 None::<&str>,
897 MsFlags::MS_REMOUNT
898 | MsFlags::MS_BIND
899 | MsFlags::MS_RDONLY
900 | MsFlags::MS_NOSUID
901 | MsFlags::MS_NODEV
902 | MsFlags::MS_NOEXEC,
903 None::<&str>,
904 )
905 .map_err(|e| {
906 NucleusError::FilesystemError(format!("Failed to remount {:?} read-only: {}", target, e))
907 })?;
908
909 Ok(())
910}
911
912pub fn mask_proc_paths(proc_path: &Path, production: bool) -> Result<()> {
920 info!("Masking sensitive /proc paths");
921
922 const CRITICAL_PROC_PATHS: &[&str] = &["kcore", "kallsyms", "sysrq-trigger"];
923
924 for name in PROC_READONLY_PATHS {
925 let target = proc_path.join(name);
926 if !target.exists() {
927 continue;
928 }
929 match remount_proc_path_readonly(&target) {
930 Ok(_) => debug!("Remounted /proc/{} read-only", name),
931 Err(e) => {
932 if production {
933 return Err(NucleusError::FilesystemError(format!(
934 "Failed to remount /proc/{} read-only in production mode: {}",
935 name, e
936 )));
937 }
938 warn!(
939 "Failed to remount /proc/{} read-only: {} (continuing)",
940 name, e
941 );
942 }
943 }
944 }
945
946 let dev_null = Path::new("/dev/null");
947
948 for name in PROC_NULL_MASKED {
949 let target = proc_path.join(name);
950 if !target.exists() {
951 continue;
952 }
953 match mount(
954 Some(dev_null),
955 &target,
956 None::<&str>,
957 MsFlags::MS_BIND,
958 None::<&str>,
959 ) {
960 Ok(_) => {
961 if let Err(e) = mount(
964 None::<&str>,
965 &target,
966 None::<&str>,
967 MsFlags::MS_REMOUNT | MsFlags::MS_BIND | MsFlags::MS_RDONLY,
968 None::<&str>,
969 ) {
970 if production && CRITICAL_PROC_PATHS.contains(name) {
971 return Err(NucleusError::FilesystemError(format!(
972 "Failed to remount /proc/{} read-only in production mode: {}",
973 name, e
974 )));
975 }
976 warn!(
977 "Failed to remount /proc/{} read-only: {} (continuing)",
978 name, e
979 );
980 }
981 debug!("Masked /proc/{} (read-only)", name);
982 }
983 Err(e) => {
984 if production && CRITICAL_PROC_PATHS.contains(name) {
985 return Err(NucleusError::FilesystemError(format!(
986 "Failed to mask critical /proc/{} in production mode: {}",
987 name, e
988 )));
989 }
990 warn!("Failed to mask /proc/{}: {} (continuing)", name, e);
991 }
992 }
993 }
994
995 for name in PROC_TMPFS_MASKED {
996 let target = proc_path.join(name);
997 if !target.exists() {
998 continue;
999 }
1000 match mount(
1001 Some("tmpfs"),
1002 &target,
1003 Some("tmpfs"),
1004 MsFlags::MS_RDONLY | MsFlags::MS_NOSUID | MsFlags::MS_NODEV | MsFlags::MS_NOEXEC,
1005 Some("size=0"),
1006 ) {
1007 Ok(_) => debug!("Masked /proc/{}", name),
1008 Err(e) => {
1009 if production {
1010 return Err(NucleusError::FilesystemError(format!(
1011 "Failed to mask /proc/{} in production mode: {}",
1012 name, e
1013 )));
1014 }
1015 warn!("Failed to mask /proc/{}: {} (continuing)", name, e);
1016 }
1017 }
1018 }
1019
1020 info!("Finished masking sensitive /proc paths");
1021 Ok(())
1022}
1023
1024pub fn switch_root(new_root: &Path, allow_chroot_fallback: bool) -> Result<()> {
1029 info!("Switching root to {:?}", new_root);
1030
1031 match pivot_root_impl(new_root) {
1032 Ok(()) => {
1033 info!("Successfully switched root using pivot_root");
1034 Ok(())
1035 }
1036 Err(e) => {
1037 if allow_chroot_fallback {
1038 warn!(
1039 "pivot_root failed ({}), falling back to chroot due to explicit \
1040 configuration",
1041 e
1042 );
1043 chroot_impl(new_root)
1044 } else {
1045 Err(NucleusError::PivotRootError(format!(
1046 "pivot_root failed: {}. chroot fallback is disabled by default; use \
1047 --allow-chroot-fallback to allow weaker isolation",
1048 e
1049 )))
1050 }
1051 }
1052 }
1053}
1054
1055fn pivot_root_impl(new_root: &Path) -> Result<()> {
1061 use nix::unistd::pivot_root;
1062
1063 let old_root = new_root.join(".old_root");
1067 std::fs::create_dir_all(&old_root).map_err(|e| {
1068 NucleusError::PivotRootError(format!("Failed to create old_root directory: {}", e))
1069 })?;
1070
1071 pivot_root(new_root, &old_root)
1073 .map_err(|e| NucleusError::PivotRootError(format!("pivot_root syscall failed: {}", e)))?;
1074
1075 std::env::set_current_dir("/")
1077 .map_err(|e| NucleusError::PivotRootError(format!("Failed to chdir to /: {}", e)))?;
1078
1079 nix::mount::umount2("/.old_root", nix::mount::MntFlags::MNT_DETACH)
1081 .map_err(|e| NucleusError::PivotRootError(format!("Failed to unmount old root: {}", e)))?;
1082
1083 let _ = std::fs::remove_dir("/.old_root");
1085
1086 Ok(())
1087}
1088
1089fn chroot_impl(new_root: &Path) -> Result<()> {
1093 fn close_non_stdio_fds_after_chroot() -> Result<()> {
1094 let ret = unsafe { libc::syscall(libc::SYS_close_range, 3u32, u32::MAX, 0u32) };
1097 if ret == 0 {
1098 return Ok(());
1099 }
1100
1101 let max_fd = match unsafe { libc::sysconf(libc::_SC_OPEN_MAX) } {
1102 n if n > 3 && n <= i32::MAX as libc::c_long => n as i32,
1103 _ => 1024,
1104 };
1105
1106 for fd in 3..max_fd {
1107 if unsafe { libc::close(fd) } != 0 {
1108 let err = std::io::Error::last_os_error();
1109 if err.raw_os_error() != Some(libc::EBADF) {
1110 return Err(NucleusError::PivotRootError(format!(
1111 "Failed to close inherited fd {} after chroot: {}",
1112 fd, err
1113 )));
1114 }
1115 }
1116 }
1117
1118 Ok(())
1119 }
1120
1121 chroot(new_root)
1122 .map_err(|e| NucleusError::PivotRootError(format!("chroot syscall failed: {}", e)))?;
1123
1124 std::env::set_current_dir("/")
1126 .map_err(|e| NucleusError::PivotRootError(format!("Failed to chdir to /: {}", e)))?;
1127
1128 close_non_stdio_fds_after_chroot()?;
1129
1130 if let Err(e) = caps::drop(
1132 None,
1133 caps::CapSet::Bounding,
1134 caps::Capability::CAP_SYS_CHROOT,
1135 ) {
1136 debug!(
1137 "Could not drop CAP_SYS_CHROOT after chroot: {} (may not be present)",
1138 e
1139 );
1140 }
1141 if let Err(e) = caps::drop(
1142 None,
1143 caps::CapSet::Effective,
1144 caps::Capability::CAP_SYS_CHROOT,
1145 ) {
1146 debug!(
1147 "Could not drop effective CAP_SYS_CHROOT: {} (may not be present)",
1148 e
1149 );
1150 }
1151 if let Err(e) = caps::drop(
1152 None,
1153 caps::CapSet::Permitted,
1154 caps::Capability::CAP_SYS_CHROOT,
1155 ) {
1156 debug!(
1157 "Could not drop permitted CAP_SYS_CHROOT: {} (may not be present)",
1158 e
1159 );
1160 }
1161
1162 info!("Successfully switched root using chroot (CAP_SYS_CHROOT dropped)");
1163
1164 Ok(())
1165}
1166
1167pub fn mount_secrets(root: &Path, secrets: &[crate::container::SecretMount]) -> Result<()> {
1172 if secrets.is_empty() {
1173 return Ok(());
1174 }
1175
1176 info!("Mounting {} secret(s) into container", secrets.len());
1177
1178 for secret in secrets {
1179 let source_fd = open(
1180 &secret.source,
1181 OFlag::O_PATH | OFlag::O_NOFOLLOW | OFlag::O_CLOEXEC,
1182 Mode::empty(),
1183 )
1184 .map_err(|e| {
1185 NucleusError::FilesystemError(format!(
1186 "Failed to open secret source {:?} with O_NOFOLLOW: {}",
1187 secret.source, e
1188 ))
1189 })?;
1190 let source_stat = fstat(&source_fd).map_err(|e| {
1191 NucleusError::FilesystemError(format!(
1192 "Failed to stat secret source {:?}: {}",
1193 secret.source, e
1194 ))
1195 })?;
1196 let source_kind = SFlag::from_bits_truncate(source_stat.st_mode);
1197 let source_is_file = source_kind == SFlag::S_IFREG;
1198 let source_is_dir = source_kind == SFlag::S_IFDIR;
1199 if !source_is_file && !source_is_dir {
1200 return Err(NucleusError::FilesystemError(format!(
1201 "Secret source {:?} must be a regular file or directory",
1202 secret.source
1203 )));
1204 }
1205 let source_fd_path = PathBuf::from(format!("/proc/self/fd/{}", source_fd.as_raw_fd()));
1206
1207 let dest = resolve_container_destination(root, &secret.dest)?;
1209
1210 if let Some(parent) = dest.parent() {
1212 std::fs::create_dir_all(parent).map_err(|e| {
1213 NucleusError::FilesystemError(format!(
1214 "Failed to create secret mount parent {:?}: {}",
1215 parent, e
1216 ))
1217 })?;
1218 }
1219
1220 if source_is_file {
1222 std::fs::write(&dest, "").map_err(|e| {
1223 NucleusError::FilesystemError(format!(
1224 "Failed to create secret mount point {:?}: {}",
1225 dest, e
1226 ))
1227 })?;
1228 } else {
1229 std::fs::create_dir_all(&dest).map_err(|e| {
1230 NucleusError::FilesystemError(format!(
1231 "Failed to create secret mount dir {:?}: {}",
1232 dest, e
1233 ))
1234 })?;
1235 }
1236
1237 mount(
1239 Some(source_fd_path.as_path()),
1240 &dest,
1241 None::<&str>,
1242 MsFlags::MS_BIND,
1243 None::<&str>,
1244 )
1245 .map_err(|e| {
1246 NucleusError::FilesystemError(format!(
1247 "Failed to bind mount secret {:?}: {}",
1248 secret.source, e
1249 ))
1250 })?;
1251
1252 mount(
1253 None::<&str>,
1254 &dest,
1255 None::<&str>,
1256 MsFlags::MS_REMOUNT
1257 | MsFlags::MS_BIND
1258 | MsFlags::MS_RDONLY
1259 | MsFlags::MS_NOSUID
1260 | MsFlags::MS_NODEV
1261 | MsFlags::MS_NOEXEC,
1262 None::<&str>,
1263 )
1264 .map_err(|e| {
1265 NucleusError::FilesystemError(format!(
1266 "Failed to remount secret {:?} read-only: {}",
1267 dest, e
1268 ))
1269 })?;
1270
1271 if source_is_file {
1273 use std::os::unix::fs::PermissionsExt;
1274 let perms = std::fs::Permissions::from_mode(secret.mode);
1275 if let Err(e) = std::fs::set_permissions(&dest, perms) {
1276 warn!(
1277 "Failed to set mode {:04o} on secret {:?}: {} (bind mount may override)",
1278 secret.mode, dest, e
1279 );
1280 }
1281 }
1282
1283 debug!(
1284 "Mounted secret {:?} -> {:?} (mode {:04o})",
1285 secret.source, secret.dest, secret.mode
1286 );
1287 }
1288
1289 Ok(())
1290}
1291
1292pub fn mount_secrets_inmemory(
1298 root: &Path,
1299 secrets: &[crate::container::SecretMount],
1300 identity: &crate::container::ProcessIdentity,
1301) -> Result<()> {
1302 if secrets.is_empty() {
1303 return Ok(());
1304 }
1305
1306 info!("Mounting {} secret(s) on in-memory tmpfs", secrets.len());
1307
1308 let secrets_dir = root.join("run/secrets");
1309 std::fs::create_dir_all(&secrets_dir).map_err(|e| {
1310 NucleusError::FilesystemError(format!(
1311 "Failed to create secrets dir {:?}: {}",
1312 secrets_dir, e
1313 ))
1314 })?;
1315
1316 if let Err(e) = mount(
1318 Some("tmpfs"),
1319 &secrets_dir,
1320 Some("tmpfs"),
1321 MsFlags::MS_NOSUID | MsFlags::MS_NODEV | MsFlags::MS_NOEXEC,
1322 Some("size=16m,mode=0700"),
1323 ) {
1324 let _ = std::fs::remove_dir_all(&secrets_dir);
1325 return Err(NucleusError::FilesystemError(format!(
1326 "Failed to mount secrets tmpfs at {:?}: {}",
1327 secrets_dir, e
1328 )));
1329 }
1330
1331 if !identity.is_root() {
1332 nix::unistd::chown(
1333 &secrets_dir,
1334 Some(nix::unistd::Uid::from_raw(identity.uid)),
1335 Some(nix::unistd::Gid::from_raw(identity.gid)),
1336 )
1337 .map_err(|e| {
1338 let _ = nix::mount::umount2(&secrets_dir, nix::mount::MntFlags::MNT_DETACH);
1339 let _ = std::fs::remove_dir_all(&secrets_dir);
1340 NucleusError::FilesystemError(format!(
1341 "Failed to set /run/secrets owner to {}:{}: {}",
1342 identity.uid, identity.gid, e
1343 ))
1344 })?;
1345 }
1346
1347 let result = mount_secrets_inmemory_inner(&secrets_dir, root, secrets, identity);
1349 if let Err(ref e) = result {
1350 let _ = nix::mount::umount2(&secrets_dir, nix::mount::MntFlags::MNT_DETACH);
1351 let _ = std::fs::remove_dir_all(&secrets_dir);
1352 return Err(NucleusError::FilesystemError(format!(
1353 "Secret mount failed (rolled back): {}",
1354 e
1355 )));
1356 }
1357
1358 info!("All secrets mounted on in-memory tmpfs");
1359 Ok(())
1360}
1361
1362fn mount_secrets_inmemory_inner(
1363 secrets_dir: &Path,
1364 root: &Path,
1365 secrets: &[crate::container::SecretMount],
1366 identity: &crate::container::ProcessIdentity,
1367) -> Result<()> {
1368 for secret in secrets {
1369 let mut content = read_regular_file_nofollow(&secret.source)?;
1370
1371 let dest = resolve_container_destination(secrets_dir, &secret.dest)?;
1373
1374 if let Some(parent) = dest.parent() {
1376 std::fs::create_dir_all(parent).map_err(|e| {
1377 NucleusError::FilesystemError(format!(
1378 "Failed to create secret parent dir {:?}: {}",
1379 parent, e
1380 ))
1381 })?;
1382 }
1383
1384 std::fs::write(&dest, &content).map_err(|e| {
1386 NucleusError::FilesystemError(format!("Failed to write secret to {:?}: {}", dest, e))
1387 })?;
1388
1389 {
1391 use std::os::unix::fs::PermissionsExt;
1392 let perms = std::fs::Permissions::from_mode(secret.mode);
1393 std::fs::set_permissions(&dest, perms).map_err(|e| {
1394 NucleusError::FilesystemError(format!(
1395 "Failed to set permissions on secret {:?}: {}",
1396 dest, e
1397 ))
1398 })?;
1399 }
1400
1401 if !identity.is_root() {
1402 nix::unistd::chown(
1403 &dest,
1404 Some(nix::unistd::Uid::from_raw(identity.uid)),
1405 Some(nix::unistd::Gid::from_raw(identity.gid)),
1406 )
1407 .map_err(|e| {
1408 NucleusError::FilesystemError(format!(
1409 "Failed to set permissions owner on secret {:?} to {}:{}: {}",
1410 dest, identity.uid, identity.gid, e
1411 ))
1412 })?;
1413 }
1414
1415 zeroize::Zeroize::zeroize(&mut content);
1417 drop(content);
1418
1419 let container_dest = resolve_container_destination(root, &secret.dest)?;
1421 if container_dest != dest {
1422 if let Some(parent) = container_dest.parent() {
1423 std::fs::create_dir_all(parent).map_err(|e| {
1424 NucleusError::FilesystemError(format!(
1425 "Failed to create secret mount parent {:?}: {}",
1426 parent, e
1427 ))
1428 })?;
1429 }
1430
1431 std::fs::write(&container_dest, "").map_err(|e| {
1432 NucleusError::FilesystemError(format!(
1433 "Failed to create secret mount point {:?}: {}",
1434 container_dest, e
1435 ))
1436 })?;
1437
1438 mount(
1439 Some(dest.as_path()),
1440 &container_dest,
1441 None::<&str>,
1442 MsFlags::MS_BIND,
1443 None::<&str>,
1444 )
1445 .map_err(|e| {
1446 NucleusError::FilesystemError(format!(
1447 "Failed to bind mount secret {:?} -> {:?}: {}",
1448 dest, container_dest, e
1449 ))
1450 })?;
1451
1452 mount(
1453 None::<&str>,
1454 &container_dest,
1455 None::<&str>,
1456 MsFlags::MS_REMOUNT
1457 | MsFlags::MS_BIND
1458 | MsFlags::MS_RDONLY
1459 | MsFlags::MS_NOSUID
1460 | MsFlags::MS_NODEV
1461 | MsFlags::MS_NOEXEC,
1462 None::<&str>,
1463 )
1464 .map_err(|e| {
1465 NucleusError::FilesystemError(format!(
1466 "Failed to remount secret {:?} read-only: {}",
1467 container_dest, e
1468 ))
1469 })?;
1470 }
1471
1472 debug!(
1473 "Secret {:?} -> {:?} (in-memory tmpfs, mode {:04o})",
1474 secret.source, secret.dest, secret.mode
1475 );
1476 }
1477
1478 Ok(())
1479}
1480
#[cfg(test)]
mod tests {
    use super::*;
    use std::os::unix::fs::symlink;

    /// Sensitive host subtrees must be refused as bind-mount sources, with an error that
    /// names the reason.
    #[test]
    fn test_validate_bind_mount_source_rejects_sensitive_subtrees() {
        const SENSITIVE_SOURCES: [&str; 4] = ["/proc/sys", "/sys/fs/cgroup", "/dev/kmsg", "/boot"];

        for path in SENSITIVE_SOURCES {
            let err = validate_bind_mount_source(Path::new(path)).unwrap_err();
            let rejected_as_sensitive = err.to_string().contains("sensitive host path");
            assert!(
                rejected_as_sensitive,
                "expected sensitive-path rejection for {path}, got: {err}"
            );
        }
    }

    /// An ordinary directory inside a temporary directory is accepted as a source.
    #[test]
    fn test_validate_bind_mount_source_allows_regular_host_paths() {
        let scratch = tempfile::TempDir::new().unwrap();
        let data_dir = scratch.path().join("data");
        std::fs::create_dir(&data_dir).unwrap();

        validate_bind_mount_source(&data_dir).unwrap();
    }

    /// A source spelled with a `..` component that leads back to an ordinary directory
    /// is still accepted.
    #[test]
    fn test_validate_bind_mount_source_normalizes_parent_components_before_filtering() {
        let scratch = tempfile::TempDir::new().unwrap();
        let data_dir = scratch.path().join("data");
        std::fs::create_dir(&data_dir).unwrap();

        let roundabout = data_dir.join("../data");
        validate_bind_mount_source(&roundabout).unwrap();
    }

    /// `sysrq-trigger` has to be present in the null-masked /proc list.
    #[test]
    fn test_proc_mask_includes_sysrq_trigger() {
        let masked = PROC_NULL_MASKED.contains(&"sysrq-trigger");
        assert!(
            masked,
            "/proc/sysrq-trigger must be masked to prevent host DoS"
        );
    }

    /// `timer_stats` has to be present in the null-masked /proc list.
    #[test]
    fn test_proc_mask_includes_timer_stats() {
        let masked = PROC_NULL_MASKED.contains(&"timer_stats");
        assert!(
            masked,
            "/proc/timer_stats must be masked to prevent kernel info leakage"
        );
    }

    /// All three `kpage*` entries have to be present in the null-masked /proc list.
    #[test]
    fn test_proc_mask_includes_kpage_files() {
        const KPAGE_FILES: [&str; 3] = ["kpagecount", "kpageflags", "kpagecgroup"];

        KPAGE_FILES.iter().for_each(|name| {
            assert!(
                PROC_NULL_MASKED.contains(name),
                "/proc/{} must be masked to prevent host memory layout leakage",
                name
            );
        });
    }

    /// The three /proc masking lists must contain the expected entries, and the
    /// read-only entries must not also appear in the tmpfs-masked list.
    #[test]
    fn test_proc_mask_includes_oci_standard_paths() {
        const NULL_MASKED: [&str; 5] = ["kallsyms", "kcore", "sched_debug", "keys", "config.gz"];
        const TMPFS_MASKED: [&str; 2] = ["acpi", "scsi"];
        const READ_ONLY: [&str; 4] = ["bus", "fs", "irq", "sys"];

        for name in NULL_MASKED.iter() {
            assert!(
                PROC_NULL_MASKED.contains(name),
                "/proc/{} must be in null-masked list (OCI spec)",
                name
            );
        }

        for name in TMPFS_MASKED.iter() {
            assert!(
                PROC_TMPFS_MASKED.contains(name),
                "/proc/{} must be in tmpfs-masked list (OCI spec)",
                name
            );
        }

        for name in READ_ONLY.iter() {
            assert!(
                PROC_READONLY_PATHS.contains(name),
                "/proc/{} must be in read-only remount list (OCI spec)",
                name
            );
            assert!(
                !PROC_TMPFS_MASKED.contains(name),
                "/proc/{} must stay visible read-only, not hidden behind tmpfs",
                name
            );
        }
    }

    /// The parser must report the mount point and the per-mount options of a
    /// mountinfo line.
    #[test]
    fn test_parse_mountinfo_line_uses_mountinfo_mount_point_and_flags() {
        let line =
            "36 25 0:32 / /run/secrets rw,nosuid,nodev,noexec,relatime - tmpfs tmpfs rw,size=1024k";

        let parsed = parse_mountinfo_line(line).unwrap();
        let (mount_point, flags) = parsed;

        assert_eq!(mount_point, "/run/secrets");
        assert!(flags.contains("nosuid"));
        assert!(flags.contains("nodev"));
        assert!(flags.contains("noexec"));
    }

    /// Octal `\040` escapes in the mount point must come back as literal spaces.
    #[test]
    fn test_parse_mountinfo_line_decodes_escaped_mount_points() {
        let line = "41 25 0:40 / /path\\040with\\040spaces ro,nosuid,nodev - ext4 /dev/root ro";

        let parsed = parse_mountinfo_line(line).unwrap();
        let (mount_point, flags) = parsed;

        assert_eq!(mount_point, "/path with spaces");
        assert!(flags.contains("ro"));
    }

    /// Source-level check: the text of the `chroot_impl` function, located by brace
    /// matching in the included file, must contain the call that closes inherited
    /// non-stdio descriptors.
    #[test]
    fn test_chroot_impl_closes_non_stdio_fds() {
        let source = include_str!("mount.rs");
        let tail = &source[source.find("fn chroot_impl").unwrap()..];
        let body_open = tail.find('{').unwrap();

        // Walk forward from the opening brace and stop at the brace that closes it; if
        // no such brace is found the slice ends at the opening brace.
        let mut nesting = 0u32;
        let body_end = tail[body_open..]
            .char_indices()
            .find_map(|(offset, ch)| {
                match ch {
                    '{' => nesting += 1,
                    '}' => {
                        nesting -= 1;
                        if nesting == 0 {
                            return Some(body_open + offset + 1);
                        }
                    }
                    _ => {}
                }
                None
            })
            .unwrap_or(body_open);

        let body = &tail[..body_end];
        let closes_fds = body.contains("close_non_stdio_fds_after_chroot()?");
        assert!(
            closes_fds,
            "chroot fallback must close inherited non-stdio fds before continuing setup"
        );
    }

    /// A plain regular file is read back byte-for-byte.
    #[test]
    fn test_read_regular_file_nofollow_reads_regular_file() {
        let scratch = tempfile::tempdir().unwrap();
        let secret_file = scratch.path().join("secret.txt");
        std::fs::write(&secret_file, "supersecret").unwrap();

        let bytes = read_regular_file_nofollow(&secret_file).unwrap();
        assert_eq!(bytes, b"supersecret");
    }

    /// Reading through a symlink must fail with an error that mentions `O_NOFOLLOW`.
    #[test]
    fn test_read_regular_file_nofollow_rejects_symlink() {
        let scratch = tempfile::tempdir().unwrap();
        let real_file = scratch.path().join("target.txt");
        let alias = scratch.path().join("secret-link");
        std::fs::write(&real_file, "supersecret").unwrap();
        symlink(&real_file, &alias).unwrap();

        let failure = read_regular_file_nofollow(&alias).unwrap_err();
        let mentions_nofollow = failure.to_string().contains("O_NOFOLLOW");
        assert!(mentions_nofollow, "symlink reads must fail via O_NOFOLLOW");
    }
}