1use crate::container::OciStatus;
2use crate::error::{NucleusError, Result};
3use crate::filesystem::{normalize_container_destination, normalize_volume_destination};
4use crate::isolation::{IdMapping, NamespaceConfig, UserNamespaceConfig};
5use crate::resources::ResourceLimits;
6use serde::{Deserialize, Serialize};
7use std::collections::{BTreeSet, HashMap};
8use std::ffi::CString;
9use std::fs;
10use std::fs::OpenOptions;
11use std::io::Write;
12use std::os::fd::{AsRawFd, FromRawFd};
13use std::os::unix::fs::{OpenOptionsExt, PermissionsExt};
14use std::path::{Path, PathBuf};
15use tracing::{debug, info, warn};
16
17#[derive(Debug, Clone, Serialize, Deserialize)]
22pub struct OciConfig {
23 #[serde(rename = "ociVersion")]
24 pub oci_version: String,
25
26 pub root: OciRoot,
27 pub process: OciProcess,
28 pub hostname: Option<String>,
29 pub mounts: Vec<OciMount>,
30 pub linux: Option<OciLinux>,
31 #[serde(default, skip_serializing_if = "Option::is_none")]
32 pub hooks: Option<OciHooks>,
33 #[serde(default, skip_serializing_if = "HashMap::is_empty")]
34 pub annotations: HashMap<String, String>,
35}
36
37#[derive(Debug, Clone, Serialize, Deserialize)]
38pub struct OciRoot {
39 pub path: String,
40 pub readonly: bool,
41}
42
43#[derive(Debug, Clone, Serialize, Deserialize)]
44pub struct OciProcess {
45 pub terminal: bool,
46 pub user: OciUser,
47 pub args: Vec<String>,
48 pub env: Vec<String>,
49 pub cwd: String,
50 #[serde(rename = "noNewPrivileges")]
51 pub no_new_privileges: bool,
52 pub capabilities: Option<OciCapabilities>,
53 #[serde(default, skip_serializing_if = "Vec::is_empty")]
54 pub rlimits: Vec<OciRlimit>,
55 #[serde(
56 rename = "consoleSize",
57 default,
58 skip_serializing_if = "Option::is_none"
59 )]
60 pub console_size: Option<OciConsoleSize>,
61 #[serde(
62 rename = "apparmorProfile",
63 default,
64 skip_serializing_if = "Option::is_none"
65 )]
66 pub apparmor_profile: Option<String>,
67 #[serde(
68 rename = "selinuxLabel",
69 default,
70 skip_serializing_if = "Option::is_none"
71 )]
72 pub selinux_label: Option<String>,
73}
74
75#[derive(Debug, Clone, Serialize, Deserialize)]
76pub struct OciUser {
77 pub uid: u32,
78 pub gid: u32,
79 #[serde(skip_serializing_if = "Option::is_none")]
80 pub additional_gids: Option<Vec<u32>>,
81}
82
83#[derive(Debug, Clone, Serialize, Deserialize)]
84pub struct OciCapabilities {
85 pub bounding: Vec<String>,
86 pub effective: Vec<String>,
87 pub inheritable: Vec<String>,
88 pub permitted: Vec<String>,
89 pub ambient: Vec<String>,
90}
91
92#[derive(Debug, Clone, Serialize, Deserialize)]
93pub struct OciMount {
94 pub destination: String,
95 pub source: String,
96 #[serde(rename = "type")]
97 pub mount_type: String,
98 pub options: Vec<String>,
99}
100
101#[derive(Debug, Clone, Serialize, Deserialize)]
102pub struct OciLinux {
103 #[serde(skip_serializing_if = "Option::is_none")]
104 pub namespaces: Option<Vec<OciNamespace>>,
105 #[serde(skip_serializing_if = "Option::is_none")]
106 pub resources: Option<OciResources>,
107 #[serde(rename = "uidMappings", skip_serializing_if = "Vec::is_empty", default)]
108 pub uid_mappings: Vec<OciIdMapping>,
109 #[serde(rename = "gidMappings", skip_serializing_if = "Vec::is_empty", default)]
110 pub gid_mappings: Vec<OciIdMapping>,
111 #[serde(rename = "maskedPaths", skip_serializing_if = "Vec::is_empty", default)]
112 pub masked_paths: Vec<String>,
113 #[serde(
114 rename = "readonlyPaths",
115 skip_serializing_if = "Vec::is_empty",
116 default
117 )]
118 pub readonly_paths: Vec<String>,
119 #[serde(default, skip_serializing_if = "Vec::is_empty")]
120 pub devices: Vec<OciDevice>,
121 #[serde(default, skip_serializing_if = "Option::is_none")]
122 pub seccomp: Option<OciSeccomp>,
123 #[serde(
124 rename = "rootfsPropagation",
125 default,
126 skip_serializing_if = "Option::is_none"
127 )]
128 pub rootfs_propagation: Option<String>,
129 #[serde(default, skip_serializing_if = "HashMap::is_empty")]
130 pub sysctl: HashMap<String, String>,
131 #[serde(
132 rename = "cgroupsPath",
133 default,
134 skip_serializing_if = "Option::is_none"
135 )]
136 pub cgroups_path: Option<String>,
137 #[serde(rename = "intelRdt", default, skip_serializing_if = "Option::is_none")]
138 pub intel_rdt: Option<OciIntelRdt>,
139}
140
141#[derive(Debug, Clone, Serialize, Deserialize)]
142pub struct OciNamespace {
143 #[serde(rename = "type")]
144 pub namespace_type: String,
145}
146
147#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
148pub struct OciIdMapping {
149 #[serde(rename = "containerID")]
150 pub container_id: u32,
151 #[serde(rename = "hostID")]
152 pub host_id: u32,
153 pub size: u32,
154}
155
156#[derive(Debug, Clone, Serialize, Deserialize)]
157pub struct OciResources {
158 #[serde(skip_serializing_if = "Option::is_none")]
159 pub memory: Option<OciMemory>,
160 #[serde(skip_serializing_if = "Option::is_none")]
161 pub cpu: Option<OciCpu>,
162 #[serde(skip_serializing_if = "Option::is_none")]
163 pub pids: Option<OciPids>,
164}
165
166#[derive(Debug, Clone, Serialize, Deserialize)]
167pub struct OciMemory {
168 #[serde(skip_serializing_if = "Option::is_none")]
169 pub limit: Option<i64>,
170}
171
172#[derive(Debug, Clone, Serialize, Deserialize)]
173pub struct OciCpu {
174 #[serde(skip_serializing_if = "Option::is_none")]
175 pub quota: Option<i64>,
176 #[serde(skip_serializing_if = "Option::is_none")]
177 pub period: Option<u64>,
178}
179
180#[derive(Debug, Clone, Serialize, Deserialize)]
181pub struct OciPids {
182 pub limit: i64,
183}
184
185#[derive(Debug, Clone, Serialize, Deserialize)]
189pub struct OciRlimit {
190 #[serde(rename = "type")]
192 pub limit_type: String,
193 pub hard: u64,
195 pub soft: u64,
197}
198
199#[derive(Debug, Clone, Serialize, Deserialize)]
201pub struct OciConsoleSize {
202 pub height: u32,
203 pub width: u32,
204}
205
206#[derive(Debug, Clone, Serialize, Deserialize)]
210pub struct OciDevice {
211 #[serde(rename = "type")]
213 pub device_type: String,
214 pub path: String,
216 #[serde(skip_serializing_if = "Option::is_none")]
218 pub major: Option<i64>,
219 #[serde(skip_serializing_if = "Option::is_none")]
221 pub minor: Option<i64>,
222 #[serde(rename = "fileMode", skip_serializing_if = "Option::is_none")]
224 pub file_mode: Option<u32>,
225 #[serde(skip_serializing_if = "Option::is_none")]
227 pub uid: Option<u32>,
228 #[serde(skip_serializing_if = "Option::is_none")]
230 pub gid: Option<u32>,
231}
232
233#[derive(Debug, Clone, Serialize, Deserialize)]
237pub struct OciSeccomp {
238 #[serde(rename = "defaultAction")]
240 pub default_action: String,
241 #[serde(default, skip_serializing_if = "Vec::is_empty")]
243 pub architectures: Vec<String>,
244 #[serde(default, skip_serializing_if = "Vec::is_empty")]
246 pub syscalls: Vec<OciSeccompSyscall>,
247}
248
249#[derive(Debug, Clone, Serialize, Deserialize)]
251pub struct OciSeccompSyscall {
252 pub names: Vec<String>,
254 pub action: String,
256 #[serde(default, skip_serializing_if = "Vec::is_empty")]
258 pub args: Vec<OciSeccompArg>,
259}
260
261#[derive(Debug, Clone, Serialize, Deserialize)]
263pub struct OciSeccompArg {
264 pub index: u32,
266 pub value: u64,
268 #[serde(rename = "valueTwo", default, skip_serializing_if = "is_zero")]
270 pub value_two: u64,
271 pub op: String,
273}
274
/// Serde `skip_serializing_if` predicate: true exactly when the value is zero.
fn is_zero(v: &u64) -> bool {
    matches!(*v, 0)
}
278
279#[derive(Debug, Clone, Serialize, Deserialize)]
283pub struct OciIntelRdt {
284 #[serde(rename = "closID", default, skip_serializing_if = "Option::is_none")]
286 pub clos_id: Option<String>,
287 #[serde(
289 rename = "l3CacheSchema",
290 default,
291 skip_serializing_if = "Option::is_none"
292 )]
293 pub l3_cache_schema: Option<String>,
294 #[serde(
296 rename = "memBwSchema",
297 default,
298 skip_serializing_if = "Option::is_none"
299 )]
300 pub mem_bw_schema: Option<String>,
301}
302
303#[derive(Debug, Clone, Serialize, Deserialize)]
307pub struct OciHook {
308 pub path: String,
310 #[serde(default, skip_serializing_if = "Vec::is_empty")]
312 pub args: Vec<String>,
313 #[serde(default, skip_serializing_if = "Vec::is_empty")]
315 pub env: Vec<String>,
316 #[serde(default, skip_serializing_if = "Option::is_none")]
318 pub timeout: Option<u32>,
319}
320
321#[derive(Debug, Clone, Default, Serialize, Deserialize)]
325pub struct OciHooks {
326 #[serde(
328 rename = "createRuntime",
329 default,
330 skip_serializing_if = "Vec::is_empty"
331 )]
332 pub create_runtime: Vec<OciHook>,
333 #[serde(
335 rename = "createContainer",
336 default,
337 skip_serializing_if = "Vec::is_empty"
338 )]
339 pub create_container: Vec<OciHook>,
340 #[serde(
342 rename = "startContainer",
343 default,
344 skip_serializing_if = "Vec::is_empty"
345 )]
346 pub start_container: Vec<OciHook>,
347 #[serde(default, skip_serializing_if = "Vec::is_empty")]
349 pub poststart: Vec<OciHook>,
350 #[serde(default, skip_serializing_if = "Vec::is_empty")]
352 pub poststop: Vec<OciHook>,
353}
354
355#[derive(Debug, Clone, Serialize)]
359pub struct OciContainerState {
360 #[serde(rename = "ociVersion")]
361 pub oci_version: String,
362 pub id: String,
363 pub status: OciStatus,
364 pub pid: u32,
365 pub bundle: String,
366}
367
368impl OciHooks {
369 pub fn is_empty(&self) -> bool {
371 self.create_runtime.is_empty()
372 && self.create_container.is_empty()
373 && self.start_container.is_empty()
374 && self.poststart.is_empty()
375 && self.poststop.is_empty()
376 }
377
378 pub fn run_hooks(hooks: &[OciHook], state: &OciContainerState, phase: &str) -> Result<()> {
382 let state_json = serde_json::to_string(state).map_err(|e| {
383 NucleusError::HookError(format!(
384 "Failed to serialize container state for hook: {}",
385 e
386 ))
387 })?;
388
389 for (i, hook) in hooks.iter().enumerate() {
390 info!(
391 "Running {} hook [{}/{}]: {}",
392 phase,
393 i + 1,
394 hooks.len(),
395 hook.path
396 );
397 Self::execute_hook(hook, &state_json, phase)?;
398 }
399
400 Ok(())
401 }
402
403 pub fn run_hooks_best_effort(hooks: &[OciHook], state: &OciContainerState, phase: &str) {
408 let state_json = match serde_json::to_string(state) {
409 Ok(json) => json,
410 Err(e) => {
411 warn!(
412 "Failed to serialize container state for {} hooks: {}",
413 phase, e
414 );
415 return;
416 }
417 };
418
419 for (i, hook) in hooks.iter().enumerate() {
420 info!(
421 "Running {} hook [{}/{}]: {}",
422 phase,
423 i + 1,
424 hooks.len(),
425 hook.path
426 );
427 if let Err(e) = Self::execute_hook(hook, &state_json, phase) {
428 warn!("{} hook [{}] failed (continuing): {}", phase, i + 1, e);
429 }
430 }
431 }
432
433 fn execute_hook(hook: &OciHook, state_json: &str, phase: &str) -> Result<()> {
434 #[cfg(not(test))]
435 use std::os::unix::process::CommandExt;
436 use std::process::{Command, Stdio};
437
438 let hook_path = Path::new(&hook.path);
439 if !hook_path.is_absolute() {
440 return Err(NucleusError::HookError(format!(
441 "{} hook path must be absolute: {}",
442 phase, hook.path
443 )));
444 }
445
446 #[cfg(not(test))]
450 {
451 const TRUSTED_HOOK_PREFIXES: &[&str] = &[
452 "/usr/bin/",
453 "/usr/sbin/",
454 "/usr/lib/",
455 "/usr/libexec/",
456 "/usr/local/bin/",
457 "/usr/local/sbin/",
458 "/usr/local/libexec/",
459 "/bin/",
460 "/sbin/",
461 "/nix/store/",
462 "/opt/",
463 ];
464 if !TRUSTED_HOOK_PREFIXES
465 .iter()
466 .any(|prefix| hook.path.starts_with(prefix))
467 {
468 return Err(NucleusError::HookError(format!(
469 "{} hook path '{}' is not under a trusted directory ({:?})",
470 phase, hook.path, TRUSTED_HOOK_PREFIXES
471 )));
472 }
473 }
474
475 match std::fs::symlink_metadata(hook_path) {
479 Ok(meta) if meta.file_type().is_symlink() => {
480 return Err(NucleusError::HookError(format!(
481 "{} hook path is a symlink (refusing to follow): {}",
482 phase, hook.path
483 )));
484 }
485 Err(_) => {
486 return Err(NucleusError::HookError(format!(
487 "{} hook binary not found: {}",
488 phase, hook.path
489 )));
490 }
491 Ok(_) => {}
492 }
493
494 Self::validate_hook_binary(hook_path, phase)?;
499
500 let mut cmd = Command::new(&hook.path);
501 if !hook.args.is_empty() {
502 cmd.args(&hook.args[1..]);
504 }
505
506 if !hook.env.is_empty() {
507 cmd.env_clear();
508 for entry in &hook.env {
509 if let Some((key, value)) = entry.split_once('=') {
510 cmd.env(key, value);
511 }
512 }
513 }
514
515 cmd.stdin(Stdio::piped());
519 cmd.stdout(Stdio::piped());
520 cmd.stderr(Stdio::piped());
521
522 #[cfg(not(test))]
526 unsafe {
527 cmd.pre_exec(|| {
528 if libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) != 0 {
531 return Err(std::io::Error::last_os_error());
532 }
533
534 let rlim_nproc = libc::rlimit {
535 rlim_cur: 1024,
536 rlim_max: 1024,
537 };
538 if libc::setrlimit(libc::RLIMIT_NPROC, &rlim_nproc) != 0 {
539 return Err(std::io::Error::last_os_error());
540 }
541
542 let rlim_nofile = libc::rlimit {
543 rlim_cur: 1024,
544 rlim_max: 1024,
545 };
546 if libc::setrlimit(libc::RLIMIT_NOFILE, &rlim_nofile) != 0 {
547 return Err(std::io::Error::last_os_error());
548 }
549
550 Ok(())
551 });
552 }
553
554 const TEXT_FILE_BUSY_SPAWN_RETRIES: usize = 100;
555 const TEXT_FILE_BUSY_RETRY_DELAY: std::time::Duration =
556 std::time::Duration::from_millis(10);
557
558 let mut text_file_busy_retries = 0;
559 let mut child = loop {
560 match cmd.spawn() {
561 Ok(child) => break child,
562 Err(e)
563 if e.raw_os_error() == Some(libc::ETXTBSY)
564 && text_file_busy_retries < TEXT_FILE_BUSY_SPAWN_RETRIES =>
565 {
566 text_file_busy_retries += 1;
567 debug!(
568 "{} hook {} was busy during spawn; retrying ({}/{})",
569 phase, hook.path, text_file_busy_retries, TEXT_FILE_BUSY_SPAWN_RETRIES
570 );
571 std::thread::sleep(TEXT_FILE_BUSY_RETRY_DELAY);
572 }
573 Err(e) => {
574 return Err(NucleusError::HookError(format!(
575 "Failed to spawn {} hook {}: {}",
576 phase, hook.path, e
577 )));
578 }
579 }
580 };
581
582 if let Some(mut stdin) = child.stdin.take() {
583 use std::io::Write as IoWrite;
584 let _ = stdin.write_all(state_json.as_bytes());
585 }
586
587 let timeout_secs = hook.timeout.unwrap_or(30) as u64;
588 let start = std::time::Instant::now();
589 let timeout = std::time::Duration::from_secs(timeout_secs);
590
591 loop {
592 match child.try_wait() {
593 Ok(Some(status)) => {
594 if status.success() {
595 debug!("{} hook {} completed successfully", phase, hook.path);
596 return Ok(());
597 } else {
598 let stderr = child
599 .stderr
600 .take()
601 .map(|mut e| {
602 let mut buf = String::new();
603 use std::io::Read;
604 let _ = e.read_to_string(&mut buf);
605 buf
606 })
607 .unwrap_or_default();
608 return Err(NucleusError::HookError(format!(
609 "{} hook {} exited with status: {}{}",
610 phase,
611 hook.path,
612 status,
613 if stderr.is_empty() {
614 String::new()
615 } else {
616 format!(" (stderr: {})", stderr.trim())
617 }
618 )));
619 }
620 }
621 Ok(None) => {
622 if start.elapsed() >= timeout {
623 let _ = child.kill();
624 let _ = child.wait();
625 return Err(NucleusError::HookError(format!(
626 "{} hook {} timed out after {}s",
627 phase, hook.path, timeout_secs
628 )));
629 }
630 std::thread::sleep(std::time::Duration::from_millis(50));
631 }
632 Err(e) => {
633 return Err(NucleusError::HookError(format!(
634 "Failed to wait for {} hook {}: {}",
635 phase, hook.path, e
636 )));
637 }
638 }
639 }
640 }
641
642 fn validate_hook_binary(hook_path: &Path, phase: &str) -> Result<()> {
648 let metadata = std::fs::symlink_metadata(hook_path).map_err(|e| {
652 NucleusError::HookError(format!(
653 "Failed to stat {} hook {}: {}",
654 phase,
655 hook_path.display(),
656 e
657 ))
658 })?;
659
660 use std::os::unix::fs::MetadataExt;
661 let mode = metadata.mode();
662 let uid = metadata.uid();
663 let gid = metadata.gid();
664 let effective_uid = nix::unistd::Uid::effective().as_raw();
665
666 if mode & 0o002 != 0 {
668 return Err(NucleusError::HookError(format!(
669 "{} hook {} is world-writable (mode {:04o}) – refusing to execute",
670 phase,
671 hook_path.display(),
672 mode & 0o7777
673 )));
674 }
675
676 if mode & 0o020 != 0 && uid != 0 {
678 return Err(NucleusError::HookError(format!(
679 "{} hook {} is group-writable and not owned by root (mode {:04o}, uid {}) – refusing to execute",
680 phase,
681 hook_path.display(),
682 mode & 0o7777,
683 uid
684 )));
685 }
686
687 if uid != 0 && uid != effective_uid {
689 return Err(NucleusError::HookError(format!(
690 "{} hook {} is owned by UID {} (expected 0 or {}) – refusing to execute",
691 phase,
692 hook_path.display(),
693 uid,
694 effective_uid
695 )));
696 }
697
698 if mode & 0o6000 != 0 {
700 return Err(NucleusError::HookError(format!(
701 "{} hook {} has setuid/setgid bits (mode {:04o}) – refusing to execute",
702 phase,
703 hook_path.display(),
704 mode & 0o7777
705 )));
706 }
707
708 debug!(
709 "{} hook {} validation passed (uid={}, gid={}, mode={:04o})",
710 phase,
711 hook_path.display(),
712 uid,
713 gid,
714 mode & 0o7777
715 );
716
717 Ok(())
718 }
719}
720
721impl OciConfig {
722 pub fn new(command: Vec<String>, hostname: Option<String>) -> Self {
724 Self {
725 oci_version: "1.0.2".to_string(),
726 root: OciRoot {
727 path: "rootfs".to_string(),
728 readonly: true,
729 },
730 process: OciProcess {
731 terminal: false,
732 user: OciUser {
733 uid: 0,
734 gid: 0,
735 additional_gids: None,
736 },
737 args: command,
738 env: vec![
739 "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin".to_string(),
740 ],
741 cwd: "/".to_string(),
742 no_new_privileges: true,
743 capabilities: Some(OciCapabilities {
744 bounding: vec![],
745 effective: vec![],
746 inheritable: vec![],
747 permitted: vec![],
748 ambient: vec![],
749 }),
750 rlimits: vec![],
751 console_size: None,
752 apparmor_profile: None,
753 selinux_label: None,
754 },
755 hostname,
756 mounts: vec![
757 OciMount {
758 destination: "/proc".to_string(),
759 source: "proc".to_string(),
760 mount_type: "proc".to_string(),
761 options: vec![
762 "nosuid".to_string(),
763 "noexec".to_string(),
764 "nodev".to_string(),
765 ],
766 },
767 OciMount {
768 destination: "/dev".to_string(),
769 source: "tmpfs".to_string(),
770 mount_type: "tmpfs".to_string(),
771 options: vec![
772 "nosuid".to_string(),
773 "noexec".to_string(),
774 "strictatime".to_string(),
775 "mode=755".to_string(),
776 "size=65536k".to_string(),
777 ],
778 },
779 OciMount {
780 destination: "/dev/shm".to_string(),
781 source: "shm".to_string(),
782 mount_type: "tmpfs".to_string(),
783 options: vec![
784 "nosuid".to_string(),
785 "noexec".to_string(),
786 "nodev".to_string(),
787 "mode=1777".to_string(),
788 "size=65536k".to_string(),
789 ],
790 },
791 OciMount {
792 destination: "/tmp".to_string(),
793 source: "tmpfs".to_string(),
794 mount_type: "tmpfs".to_string(),
795 options: vec![
796 "nosuid".to_string(),
797 "nodev".to_string(),
798 "noexec".to_string(),
799 "mode=1777".to_string(),
800 "size=65536k".to_string(),
801 ],
802 },
803 OciMount {
804 destination: "/sys".to_string(),
805 source: "sysfs".to_string(),
806 mount_type: "sysfs".to_string(),
807 options: vec![
808 "nosuid".to_string(),
809 "noexec".to_string(),
810 "nodev".to_string(),
811 "ro".to_string(),
812 ],
813 },
814 ],
815 hooks: None,
816 annotations: HashMap::new(),
817 linux: Some(OciLinux {
818 namespaces: Some(vec![
819 OciNamespace {
820 namespace_type: "pid".to_string(),
821 },
822 OciNamespace {
823 namespace_type: "network".to_string(),
824 },
825 OciNamespace {
826 namespace_type: "ipc".to_string(),
827 },
828 OciNamespace {
829 namespace_type: "uts".to_string(),
830 },
831 OciNamespace {
832 namespace_type: "mount".to_string(),
833 },
834 ]),
835 resources: None,
836 uid_mappings: vec![],
837 gid_mappings: vec![],
838 masked_paths: vec![
840 "/proc/acpi".to_string(),
841 "/proc/asound".to_string(),
842 "/proc/kcore".to_string(),
843 "/proc/keys".to_string(),
844 "/proc/latency_stats".to_string(),
845 "/proc/sched_debug".to_string(),
846 "/proc/scsi".to_string(),
847 "/proc/timer_list".to_string(),
848 "/proc/timer_stats".to_string(),
849 "/proc/sysrq-trigger".to_string(), "/proc/kpagecount".to_string(),
851 "/proc/kpageflags".to_string(),
852 "/proc/kpagecgroup".to_string(),
853 "/proc/config.gz".to_string(),
854 "/proc/kallsyms".to_string(),
855 "/sys/firmware".to_string(),
856 ],
857 readonly_paths: vec![
858 "/proc/bus".to_string(),
859 "/proc/fs".to_string(),
860 "/proc/irq".to_string(),
861 "/proc/sys".to_string(),
862 ],
863 devices: vec![
864 OciDevice {
865 device_type: "c".to_string(),
866 path: "/dev/null".to_string(),
867 major: Some(1),
868 minor: Some(3),
869 file_mode: Some(0o666),
870 uid: Some(0),
871 gid: Some(0),
872 },
873 OciDevice {
874 device_type: "c".to_string(),
875 path: "/dev/zero".to_string(),
876 major: Some(1),
877 minor: Some(5),
878 file_mode: Some(0o666),
879 uid: Some(0),
880 gid: Some(0),
881 },
882 OciDevice {
883 device_type: "c".to_string(),
884 path: "/dev/full".to_string(),
885 major: Some(1),
886 minor: Some(7),
887 file_mode: Some(0o666),
888 uid: Some(0),
889 gid: Some(0),
890 },
891 OciDevice {
892 device_type: "c".to_string(),
893 path: "/dev/random".to_string(),
894 major: Some(1),
895 minor: Some(8),
896 file_mode: Some(0o666),
897 uid: Some(0),
898 gid: Some(0),
899 },
900 OciDevice {
901 device_type: "c".to_string(),
902 path: "/dev/urandom".to_string(),
903 major: Some(1),
904 minor: Some(9),
905 file_mode: Some(0o666),
906 uid: Some(0),
907 gid: Some(0),
908 },
909 ],
910 seccomp: None,
911 rootfs_propagation: Some("rprivate".to_string()),
912 sysctl: HashMap::new(),
913 cgroups_path: None,
914 intel_rdt: None,
915 }),
916 }
917 }
918
919 pub fn with_resources(mut self, limits: &ResourceLimits) -> Self {
921 let mut resources = OciResources {
922 memory: None,
923 cpu: None,
924 pids: None,
925 };
926
927 if let Some(memory_bytes) = limits.memory_bytes {
928 resources.memory = Some(OciMemory {
929 limit: Some(memory_bytes as i64),
930 });
931 }
932
933 if let Some(quota_us) = limits.cpu_quota_us {
934 resources.cpu = Some(OciCpu {
935 quota: Some(quota_us as i64),
936 period: Some(limits.cpu_period_us),
937 });
938 }
939
940 if let Some(pids_max) = limits.pids_max {
941 resources.pids = Some(OciPids {
942 limit: pids_max as i64,
943 });
944 }
945
946 if let Some(linux) = &mut self.linux {
947 linux.resources = Some(resources);
948 }
949
950 self
951 }
952
953 pub fn with_no_new_privileges(mut self, enabled: bool) -> Self {
955 self.process.no_new_privileges = enabled;
956 self
957 }
958
959 pub fn with_env(mut self, vars: &[(String, String)]) -> Self {
961 for (key, value) in vars {
962 self.process.env.push(format!("{}={}", key, value));
963 }
964 self
965 }
966
967 pub fn with_sd_notify(mut self) -> Self {
969 if let Ok(notify_socket) = std::env::var("NOTIFY_SOCKET") {
970 self.process
971 .env
972 .push(format!("NOTIFY_SOCKET={}", notify_socket));
973 }
974 self
975 }
976
977 pub fn with_secret_mounts(mut self, secrets: &[crate::container::SecretMount]) -> Self {
979 for secret in secrets {
980 self.mounts.push(OciMount {
981 destination: secret.dest.to_string_lossy().to_string(),
982 source: secret.source.to_string_lossy().to_string(),
983 mount_type: "bind".to_string(),
984 options: vec![
985 "bind".to_string(),
986 "ro".to_string(),
987 "nosuid".to_string(),
988 "nodev".to_string(),
989 "noexec".to_string(),
990 ],
991 });
992 }
993 self
994 }
995
996 pub fn with_process_identity(mut self, identity: &crate::container::ProcessIdentity) -> Self {
998 self.process.user.uid = identity.uid;
999 self.process.user.gid = identity.gid;
1000 self.process.user.additional_gids = if identity.additional_gids.is_empty() {
1001 None
1002 } else {
1003 Some(identity.additional_gids.clone())
1004 };
1005 self
1006 }
1007
1008 pub fn with_inmemory_secret_mounts(
1012 mut self,
1013 stage_dir: &Path,
1014 secrets: &[crate::container::SecretMount],
1015 ) -> Result<Self> {
1016 self.mounts.push(OciMount {
1017 destination: "/run/secrets".to_string(),
1018 source: stage_dir.to_string_lossy().to_string(),
1019 mount_type: "bind".to_string(),
1020 options: vec![
1021 "bind".to_string(),
1022 "ro".to_string(),
1023 "nosuid".to_string(),
1024 "nodev".to_string(),
1025 "noexec".to_string(),
1026 ],
1027 });
1028
1029 for secret in secrets {
1030 let dest = normalize_container_destination(&secret.dest)?;
1031 if !secret.source.starts_with(stage_dir) {
1032 return Err(NucleusError::ConfigError(format!(
1033 "Staged secret source {:?} must live under {:?}",
1034 secret.source, stage_dir
1035 )));
1036 }
1037 self.mounts.push(OciMount {
1038 destination: dest.to_string_lossy().to_string(),
1039 source: secret.source.to_string_lossy().to_string(),
1040 mount_type: "bind".to_string(),
1041 options: vec![
1042 "bind".to_string(),
1043 "ro".to_string(),
1044 "nosuid".to_string(),
1045 "nodev".to_string(),
1046 "noexec".to_string(),
1047 ],
1048 });
1049 }
1050
1051 Ok(self)
1052 }
1053
1054 pub fn with_volume_mounts(mut self, volumes: &[crate::container::VolumeMount]) -> Result<Self> {
1056 use crate::container::VolumeSource;
1057
1058 for volume in volumes {
1059 let dest = normalize_volume_destination(&volume.dest)?;
1060 match &volume.source {
1061 VolumeSource::Bind { source } => {
1062 crate::filesystem::validate_bind_mount_source(source)?;
1063 let mut options = vec![
1064 "bind".to_string(),
1065 "nosuid".to_string(),
1066 "nodev".to_string(),
1067 ];
1068 if volume.read_only {
1069 options.push("ro".to_string());
1070 }
1071 self.mounts.push(OciMount {
1072 destination: dest.to_string_lossy().to_string(),
1073 source: source.to_string_lossy().to_string(),
1074 mount_type: "bind".to_string(),
1075 options,
1076 });
1077 }
1078 VolumeSource::Tmpfs { size } => {
1079 let mut options = vec![
1080 "nosuid".to_string(),
1081 "nodev".to_string(),
1082 "mode=0755".to_string(),
1083 ];
1084 if volume.read_only {
1085 options.push("ro".to_string());
1086 }
1087 if let Some(size) = size {
1088 options.push(format!("size={}", size));
1089 }
1090 self.mounts.push(OciMount {
1091 destination: dest.to_string_lossy().to_string(),
1092 source: "tmpfs".to_string(),
1093 mount_type: "tmpfs".to_string(),
1094 options,
1095 });
1096 }
1097 }
1098 }
1099
1100 Ok(self)
1101 }
1102
1103 pub fn with_context_bind(mut self, context_dir: &std::path::Path) -> Self {
1108 self.mounts.push(OciMount {
1109 destination: "/context".to_string(),
1110 source: context_dir.to_string_lossy().to_string(),
1111 mount_type: "bind".to_string(),
1112 options: vec![
1113 "bind".to_string(),
1114 "ro".to_string(),
1115 "nosuid".to_string(),
1116 "nodev".to_string(),
1117 ],
1118 });
1119 self
1120 }
1121
1122 pub fn with_rootfs_binds(mut self, rootfs_path: &std::path::Path) -> Self {
1124 let subdirs = ["bin", "sbin", "lib", "lib64", "usr", "etc", "nix"];
1125 for subdir in &subdirs {
1126 let source = rootfs_path.join(subdir);
1127 if source.exists() {
1128 self.mounts.push(OciMount {
1129 destination: format!("/{}", subdir),
1130 source: source.to_string_lossy().to_string(),
1131 mount_type: "bind".to_string(),
1132 options: vec![
1133 "bind".to_string(),
1134 "ro".to_string(),
1135 "nosuid".to_string(),
1136 "nodev".to_string(),
1137 ],
1138 });
1139 }
1140 }
1141 self
1142 }
1143
1144 pub fn with_namespace_config(mut self, config: &NamespaceConfig) -> Self {
1146 let mut namespaces = Vec::new();
1147
1148 if config.pid {
1149 namespaces.push(OciNamespace {
1150 namespace_type: "pid".to_string(),
1151 });
1152 }
1153 if config.net {
1154 namespaces.push(OciNamespace {
1155 namespace_type: "network".to_string(),
1156 });
1157 }
1158 if config.ipc {
1159 namespaces.push(OciNamespace {
1160 namespace_type: "ipc".to_string(),
1161 });
1162 }
1163 if config.uts {
1164 namespaces.push(OciNamespace {
1165 namespace_type: "uts".to_string(),
1166 });
1167 }
1168 if config.mnt {
1169 namespaces.push(OciNamespace {
1170 namespace_type: "mount".to_string(),
1171 });
1172 }
1173 if config.cgroup {
1174 namespaces.push(OciNamespace {
1175 namespace_type: "cgroup".to_string(),
1176 });
1177 }
1178 if config.time {
1179 namespaces.push(OciNamespace {
1180 namespace_type: "time".to_string(),
1181 });
1182 }
1183 if config.user {
1184 namespaces.push(OciNamespace {
1185 namespace_type: "user".to_string(),
1186 });
1187 }
1188
1189 if let Some(linux) = &mut self.linux {
1190 linux.namespaces = Some(namespaces);
1191 }
1192
1193 self
1194 }
1195
1196 pub fn with_host_runtime_binds(mut self) -> Self {
1202 let host_paths: BTreeSet<String> =
1205 ["/bin", "/sbin", "/usr", "/lib", "/lib64", "/nix/store"]
1206 .iter()
1207 .map(|s| s.to_string())
1208 .collect();
1209
1210 for host_path in host_paths {
1211 let source = Path::new(&host_path);
1212 if !source.exists() {
1213 continue;
1214 }
1215
1216 self.mounts.push(OciMount {
1217 destination: host_path.clone(),
1218 source: source.to_string_lossy().to_string(),
1219 mount_type: "bind".to_string(),
1220 options: vec![
1221 "bind".to_string(),
1222 "ro".to_string(),
1223 "nosuid".to_string(),
1224 "nodev".to_string(),
1225 ],
1226 });
1227 }
1228 self
1229 }
1230
1231 pub fn with_user_namespace(mut self) -> Self {
1233 if let Some(linux) = &mut self.linux {
1234 if let Some(namespaces) = &mut linux.namespaces {
1235 namespaces.push(OciNamespace {
1236 namespace_type: "user".to_string(),
1237 });
1238 }
1239 }
1240 self
1241 }
1242
1243 pub fn without_network_namespace(mut self) -> Self {
1246 if let Some(linux) = &mut self.linux {
1247 if let Some(namespaces) = &mut linux.namespaces {
1248 namespaces.retain(|ns| ns.namespace_type != "network");
1249 }
1250 }
1251
1252 self
1253 }
1254
1255 pub fn with_rootless_user_namespace(mut self, config: &UserNamespaceConfig) -> Self {
1262 if let Some(linux) = &mut self.linux {
1263 if let Some(namespaces) = &mut linux.namespaces {
1264 namespaces.retain(|ns| ns.namespace_type != "network");
1265 if !namespaces.iter().any(|ns| ns.namespace_type == "user") {
1266 namespaces.push(OciNamespace {
1267 namespace_type: "user".to_string(),
1268 });
1269 }
1270 }
1271 linux.uid_mappings = config.uid_mappings.iter().map(OciIdMapping::from).collect();
1272 linux.gid_mappings = config.gid_mappings.iter().map(OciIdMapping::from).collect();
1273 }
1274 self
1275 }
1276
1277 pub fn with_hooks(mut self, hooks: OciHooks) -> Self {
1279 if hooks.is_empty() {
1280 self.hooks = None;
1281 } else {
1282 self.hooks = Some(hooks);
1283 }
1284 self
1285 }
1286
1287 pub fn with_rlimits(mut self, limits: &ResourceLimits) -> Self {
1292 let mut rlimits = Vec::with_capacity(3);
1293
1294 if let Some(nproc_limit) = limits.pids_max {
1295 rlimits.push(OciRlimit {
1296 limit_type: "RLIMIT_NPROC".to_string(),
1297 hard: nproc_limit,
1298 soft: nproc_limit,
1299 });
1300 }
1301
1302 rlimits.push(OciRlimit {
1303 limit_type: "RLIMIT_NOFILE".to_string(),
1304 hard: 1024,
1305 soft: 1024,
1306 });
1307
1308 let memlock_limit = limits.memlock_bytes.unwrap_or(64 * 1024);
1309 rlimits.push(OciRlimit {
1310 limit_type: "RLIMIT_MEMLOCK".to_string(),
1311 hard: memlock_limit,
1312 soft: memlock_limit,
1313 });
1314
1315 self.process.rlimits = rlimits;
1316 self
1317 }
1318
1319 pub fn with_seccomp(mut self, seccomp: OciSeccomp) -> Self {
1321 if let Some(linux) = &mut self.linux {
1322 linux.seccomp = Some(seccomp);
1323 }
1324 self
1325 }
1326
1327 pub fn with_cgroups_path(mut self, path: String) -> Self {
1329 if let Some(linux) = &mut self.linux {
1330 linux.cgroups_path = Some(path);
1331 }
1332 self
1333 }
1334
1335 pub fn with_sysctl(mut self, sysctl: HashMap<String, String>) -> Self {
1337 if let Some(linux) = &mut self.linux {
1338 linux.sysctl = sysctl;
1339 }
1340 self
1341 }
1342
1343 pub fn with_annotations(mut self, annotations: HashMap<String, String>) -> Self {
1345 self.annotations = annotations;
1346 self
1347 }
1348}
1349
1350impl From<&IdMapping> for OciIdMapping {
1351 fn from(mapping: &IdMapping) -> Self {
1352 Self {
1353 container_id: mapping.container_id,
1354 host_id: mapping.host_id,
1355 size: mapping.count,
1356 }
1357 }
1358}
1359
/// Pairs a bundle location on disk with the configuration that belongs to it.
pub struct OciBundle {
    // Path of the bundle directory.
    bundle_path: PathBuf,
    // Configuration associated with the bundle.
    config: OciConfig,
}
1367
/// Validates a single path component and converts it to a C string.
///
/// # Errors
/// Returns `InvalidInput` when `name` is empty, is `.` or `..`, contains a
/// `/`, or contains an interior NUL byte.
fn safe_child_name(name: &str) -> std::io::Result<CString> {
    use std::io::{Error, ErrorKind};

    let is_plain_component = !matches!(name, "" | "." | "..") && !name.contains('/');
    if !is_plain_component {
        return Err(Error::new(
            ErrorKind::InvalidInput,
            "invalid path child name",
        ));
    }

    match CString::new(name) {
        Ok(c_name) => Ok(c_name),
        Err(_) => Err(Error::new(
            ErrorKind::InvalidInput,
            "path child name contains NUL",
        )),
    }
}
1383
1384fn open_dir_nofollow(path: &Path) -> std::io::Result<fs::File> {
1385 OpenOptions::new()
1386 .read(true)
1387 .custom_flags(libc::O_DIRECTORY | libc::O_NOFOLLOW | libc::O_CLOEXEC)
1388 .open(path)
1389}
1390
1391fn mkdirat_dir(parent: &fs::File, name: &str, mode: libc::mode_t) -> std::io::Result<()> {
1392 let name = safe_child_name(name)?;
1393 let result = unsafe { libc::mkdirat(parent.as_raw_fd(), name.as_ptr(), mode) };
1394
1395 if result == 0 {
1396 return Ok(());
1397 }
1398
1399 let err = std::io::Error::last_os_error();
1400 if err.raw_os_error() == Some(libc::EEXIST) {
1401 Ok(())
1402 } else {
1403 Err(err)
1404 }
1405}
1406
1407fn openat_dir_nofollow(parent: &fs::File, name: &str) -> std::io::Result<fs::File> {
1408 let name = safe_child_name(name)?;
1409 let fd = unsafe {
1410 libc::openat(
1411 parent.as_raw_fd(),
1412 name.as_ptr(),
1413 libc::O_RDONLY | libc::O_DIRECTORY | libc::O_NOFOLLOW | libc::O_CLOEXEC,
1414 )
1415 };
1416
1417 if fd < 0 {
1418 Err(std::io::Error::last_os_error())
1419 } else {
1420 Ok(unsafe { fs::File::from_raw_fd(fd) })
1421 }
1422}
1423
1424fn openat_file_nofollow(
1425 parent: &fs::File,
1426 name: &str,
1427 mode: libc::mode_t,
1428) -> std::io::Result<fs::File> {
1429 let name = safe_child_name(name)?;
1430 let fd = unsafe {
1431 libc::openat(
1432 parent.as_raw_fd(),
1433 name.as_ptr(),
1434 libc::O_WRONLY | libc::O_CREAT | libc::O_TRUNC | libc::O_NOFOLLOW | libc::O_CLOEXEC,
1435 mode,
1436 )
1437 };
1438
1439 if fd < 0 {
1440 Err(std::io::Error::last_os_error())
1441 } else {
1442 Ok(unsafe { fs::File::from_raw_fd(fd) })
1443 }
1444}
1445
1446impl OciBundle {
1447 pub fn new(bundle_path: PathBuf, config: OciConfig) -> Self {
1449 Self {
1450 bundle_path,
1451 config,
1452 }
1453 }
1454
1455 pub fn create(&self) -> Result<()> {
1457 info!("Creating OCI bundle at {:?}", self.bundle_path);
1458
1459 fs::create_dir_all(&self.bundle_path).map_err(|e| {
1462 NucleusError::GVisorError(format!(
1463 "Failed to create bundle directory {:?}: {}",
1464 self.bundle_path, e
1465 ))
1466 })?;
1467 let bundle_dir = open_dir_nofollow(&self.bundle_path).map_err(|e| {
1468 NucleusError::GVisorError(format!(
1469 "Failed to open bundle directory safely {:?}: {}",
1470 self.bundle_path, e
1471 ))
1472 })?;
1473 bundle_dir
1474 .set_permissions(fs::Permissions::from_mode(0o700))
1475 .map_err(|e| {
1476 NucleusError::GVisorError(format!(
1477 "Failed to secure bundle directory permissions {:?}: {}",
1478 self.bundle_path, e
1479 ))
1480 })?;
1481
1482 let rootfs = self.bundle_path.join("rootfs");
1486 mkdirat_dir(&bundle_dir, "rootfs", 0o755).map_err(|e| {
1487 NucleusError::GVisorError(format!("Failed to create rootfs directory: {}", e))
1488 })?;
1489 let rootfs_dir = openat_dir_nofollow(&bundle_dir, "rootfs").map_err(|e| {
1490 NucleusError::GVisorError(format!(
1491 "Failed to open rootfs directory safely {:?}: {}",
1492 rootfs, e
1493 ))
1494 })?;
1495 rootfs_dir
1500 .set_permissions(fs::Permissions::from_mode(0o755))
1501 .map_err(|e| {
1502 NucleusError::GVisorError(format!(
1503 "Failed to set rootfs directory permissions {:?}: {}",
1504 rootfs, e
1505 ))
1506 })?;
1507
1508 let config_path = self.bundle_path.join("config.json");
1510 let config_json = serde_json::to_string_pretty(&self.config).map_err(|e| {
1511 NucleusError::GVisorError(format!("Failed to serialize OCI config: {}", e))
1512 })?;
1513
1514 let mut file = openat_file_nofollow(&bundle_dir, "config.json", 0o600).map_err(|e| {
1515 NucleusError::GVisorError(format!(
1516 "Failed to open config.json safely {:?}: {}",
1517 config_path, e
1518 ))
1519 })?;
1520 file.set_permissions(fs::Permissions::from_mode(0o600))
1521 .map_err(|e| {
1522 NucleusError::GVisorError(format!(
1523 "Failed to set config.json permissions {:?}: {}",
1524 config_path, e
1525 ))
1526 })?;
1527 file.write_all(config_json.as_bytes()).map_err(|e| {
1528 NucleusError::GVisorError(format!("Failed to write config.json: {}", e))
1529 })?;
1530 file.sync_all()
1531 .map_err(|e| NucleusError::GVisorError(format!("Failed to sync config.json: {}", e)))?;
1532
1533 debug!("Created OCI bundle structure at {:?}", self.bundle_path);
1534
1535 Ok(())
1536 }
1537
1538 pub fn rootfs_path(&self) -> PathBuf {
1540 self.bundle_path.join("rootfs")
1541 }
1542
1543 pub fn bundle_path(&self) -> &Path {
1545 &self.bundle_path
1546 }
1547
1548 pub fn cleanup(&self) -> Result<()> {
1550 if self.bundle_path.exists() {
1551 fs::remove_dir_all(&self.bundle_path).map_err(|e| {
1552 NucleusError::GVisorError(format!("Failed to cleanup bundle: {}", e))
1553 })?;
1554 debug!("Cleaned up OCI bundle at {:?}", self.bundle_path);
1555 }
1556 Ok(())
1557 }
1558}
1559
1560#[cfg(test)]
1561mod tests {
1562 use super::*;
1563 use std::os::unix::fs::symlink;
1564 use tempfile::TempDir;
1565
// A freshly built config carries the spec version, the default root path,
// the supplied args and the supplied hostname.
#[test]
fn test_oci_config_new() {
    let args = vec!["/bin/sh".to_string()];
    let hostname = Some("test".to_string());

    let config = OciConfig::new(args, hostname);

    assert_eq!(config.oci_version, "1.0.2");
    assert_eq!(config.root.path, "rootfs");
    assert_eq!(config.process.args, vec!["/bin/sh"]);
    assert_eq!(config.hostname, Some("test".to_string()));
}
1575
// Memory and CPU limits must both show up under `linux.resources`.
#[test]
fn test_oci_config_with_resources() {
    let limits = ResourceLimits::unlimited()
        .with_memory("512M")
        .unwrap()
        .with_cpu_cores(2.0)
        .unwrap();

    let config = OciConfig::new(vec!["/bin/sh".to_string()], None).with_resources(&limits);

    let linux = config.linux.expect("linux section must be present");
    let resources = linux.resources.expect("resources must be populated");
    assert!(resources.memory.is_some());
    assert!(resources.cpu.is_some());
}
1594
// `create` must lay out the bundle directory, `rootfs/` and `config.json`;
// `cleanup` must remove all of it again.
#[test]
fn test_oci_bundle_create() {
    let scratch = TempDir::new().unwrap();
    let bundle_path = scratch.path().join("test-bundle");
    let bundle = OciBundle::new(
        bundle_path.clone(),
        OciConfig::new(vec!["/bin/sh".to_string()], None),
    );

    bundle.create().unwrap();

    let expected_entries = [
        bundle_path.clone(),
        bundle_path.join("rootfs"),
        bundle_path.join("config.json"),
    ];
    for entry in &expected_entries {
        assert!(entry.exists());
    }

    bundle.cleanup().unwrap();
    assert!(!bundle_path.exists());
}
1612
// If the bundle path itself is a symlink, `create` must fail and must not
// touch the directory the symlink points at.
#[test]
fn test_oci_bundle_rejects_bundle_symlink() {
    let scratch = TempDir::new().unwrap();
    let bundle_path = scratch.path().join("test-bundle");
    let victim_dir = scratch.path().join("protected-host-dir");

    fs::create_dir_all(&victim_dir).unwrap();
    fs::set_permissions(&victim_dir, fs::Permissions::from_mode(0o755)).unwrap();
    symlink(&victim_dir, &bundle_path).unwrap();

    let bundle = OciBundle::new(
        bundle_path.clone(),
        OciConfig::new(vec!["/bin/sh".to_string()], None),
    );
    let err = bundle.create().unwrap_err();
    assert!(err
        .to_string()
        .contains("Failed to open bundle directory safely"));

    // The symlink target must keep its original mode (not be chmod-ed to 0700).
    let victim_mode = fs::metadata(&victim_dir).unwrap().permissions().mode() & 0o777;
    assert_eq!(victim_mode, 0o755);

    // The symlink itself must still be in place.
    let link_meta = fs::symlink_metadata(&bundle_path).unwrap();
    assert!(link_meta.file_type().is_symlink());
}
1642
// If `rootfs` inside the bundle is a symlink, `create` must fail and must not
// touch the directory the symlink points at.
#[test]
fn test_oci_bundle_rejects_rootfs_symlink() {
    let scratch = TempDir::new().unwrap();
    let bundle_path = scratch.path().join("test-bundle");
    let victim_dir = scratch.path().join("protected-host-dir");
    let rootfs_link = bundle_path.join("rootfs");

    fs::create_dir_all(&bundle_path).unwrap();
    fs::create_dir_all(&victim_dir).unwrap();
    fs::set_permissions(&victim_dir, fs::Permissions::from_mode(0o700)).unwrap();
    symlink(&victim_dir, &rootfs_link).unwrap();

    let bundle = OciBundle::new(
        bundle_path.clone(),
        OciConfig::new(vec!["/bin/sh".to_string()], None),
    );
    let err = bundle.create().unwrap_err();
    assert!(err
        .to_string()
        .contains("Failed to open rootfs directory safely"));

    // The symlink target must keep its original mode (not be chmod-ed to 0755).
    let victim_mode = fs::metadata(&victim_dir).unwrap().permissions().mode() & 0o777;
    assert_eq!(victim_mode, 0o700);

    // The symlink itself must still be in place.
    let link_meta = fs::symlink_metadata(&rootfs_link).unwrap();
    assert!(link_meta.file_type().is_symlink());
}
1673
// The config must serialize to JSON with the expected keys/values and
// deserialize back to an equivalent value.
#[test]
fn test_oci_config_serialization() {
    let config = OciConfig::new(vec!["/bin/sh".to_string()], Some("test".to_string()));

    let json = serde_json::to_string_pretty(&config).unwrap();
    for needle in ["ociVersion", "1.0.2", "/bin/sh"] {
        assert!(json.contains(needle));
    }

    let roundtripped: OciConfig = serde_json::from_str(&json).unwrap();
    assert_eq!(roundtripped.oci_version, config.oci_version);
    assert_eq!(roundtripped.process.args, config.process.args);
}
1688
// Host runtime binds must come from a fixed list of paths, never from the
// host's $PATH.
#[test]
fn test_host_runtime_binds_uses_fixed_paths_not_host_path() {
    // Deliberately poison the process-wide PATH; none of these directories may
    // appear in the generated mounts.
    std::env::set_var("PATH", "/tmp/evil-inject-path/bin:/opt/attacker/sbin");

    let config = OciConfig::new(vec!["/bin/sh".to_string()], None).with_host_runtime_binds();

    for injected in ["/tmp/evil-inject-path", "/opt/attacker"] {
        let leaked_into_dest = config
            .mounts
            .iter()
            .any(|mount| mount.destination.as_str().contains(injected));
        assert!(
            !leaked_into_dest,
            "with_host_runtime_binds must not use host $PATH – found {:?} in mount destinations",
            injected
        );

        let leaked_into_src = config
            .mounts
            .iter()
            .any(|mount| mount.source.as_str().contains(injected));
        assert!(
            !leaked_into_src,
            "with_host_runtime_binds must not use host $PATH – found {:?} in mount sources",
            injected
        );
    }

    let allowed_prefixes = ["/bin", "/sbin", "/usr", "/lib", "/lib64", "/nix/store"];
    let bind_mounts = config
        .mounts
        .iter()
        .filter(|mount| mount.mount_type == "bind");
    for mount in bind_mounts {
        let permitted = allowed_prefixes
            .iter()
            .any(|prefix| mount.destination.starts_with(*prefix));
        assert!(
            permitted,
            "unexpected bind mount destination: {} – only FHS paths allowed",
            mount.destination
        );
    }
}
1730
// A read-only bind volume must get the `ro` option and a sized tmpfs volume
// must get its `size=` option.
#[test]
fn test_volume_mounts_include_bind_and_tmpfs_options() {
    let tmp = tempfile::TempDir::new().unwrap();

    let bind_volume = crate::container::VolumeMount {
        source: crate::container::VolumeSource::Bind {
            source: tmp.path().to_path_buf(),
        },
        dest: std::path::PathBuf::from("/var/lib/app"),
        read_only: true,
    };
    let tmpfs_volume = crate::container::VolumeMount {
        source: crate::container::VolumeSource::Tmpfs {
            size: Some("64M".to_string()),
        },
        dest: std::path::PathBuf::from("/var/cache/app"),
        read_only: false,
    };

    let config = OciConfig::new(vec!["/bin/sh".to_string()], None)
        .with_volume_mounts(&[bind_volume, tmpfs_volume])
        .unwrap();

    let has_mount = |dest: &str, kind: &str, option: &str| {
        config.mounts.iter().any(|mount| {
            mount.destination == dest
                && mount.mount_type == kind
                && mount.options.contains(&option.to_string())
        })
    };
    assert!(has_mount("/var/lib/app", "bind", "ro"));
    assert!(has_mount("/var/cache/app", "tmpfs", "size=64M"));
}
1764
// Binding a sensitive host path such as `/proc/sys` must be refused.
#[test]
fn test_volume_mounts_reject_sensitive_host_sources() {
    let sensitive_bind = crate::container::VolumeMount {
        source: crate::container::VolumeSource::Bind {
            source: std::path::PathBuf::from("/proc/sys"),
        },
        dest: std::path::PathBuf::from("/host-proc"),
        read_only: true,
    };

    let outcome =
        OciConfig::new(vec!["/bin/sh".to_string()], None).with_volume_mounts(&[sensitive_bind]);

    let err = outcome.unwrap_err();
    assert!(err.to_string().contains("sensitive host path"));
}
1779
// Mounting a volume over a reserved container path such as `/usr/bin` must
// be refused.
#[test]
fn test_volume_mounts_reject_reserved_destinations() {
    let tmp = tempfile::TempDir::new().unwrap();
    let reserved_dest_bind = crate::container::VolumeMount {
        source: crate::container::VolumeSource::Bind {
            source: tmp.path().to_path_buf(),
        },
        dest: std::path::PathBuf::from("/usr/bin"),
        read_only: true,
    };

    let outcome = OciConfig::new(vec!["/bin/sh".to_string()], None)
        .with_volume_mounts(&[reserved_dest_bind]);

    let err = outcome.unwrap_err();
    assert!(err.to_string().contains("reserved"));
}
1795
// uid, gid and supplementary gids must be copied into `process.user`.
#[test]
fn test_oci_config_with_process_identity() {
    let identity = crate::container::ProcessIdentity {
        uid: 1001,
        gid: 1002,
        additional_gids: vec![1003, 1004],
    };

    let config =
        OciConfig::new(vec!["/bin/sh".to_string()], None).with_process_identity(&identity);

    let user = &config.process.user;
    assert_eq!(user.uid, 1001);
    assert_eq!(user.gid, 1002);
    assert_eq!(user.additional_gids, Some(vec![1003, 1004]));
}
1810
// Configured pids and memlock limits must be reflected as RLIMIT_NPROC and
// RLIMIT_MEMLOCK with soft == hard.
#[test]
fn test_oci_config_with_rlimits_uses_configured_memlock() {
    let limits = ResourceLimits::default()
        .with_pids(99)
        .unwrap()
        .with_memlock("8M")
        .unwrap();

    let config = OciConfig::new(vec!["/bin/sh".to_string()], None).with_rlimits(&limits);
    let rlimits = &config.process.rlimits;

    let nproc_matches = rlimits
        .iter()
        .filter(|limit| limit.limit_type == "RLIMIT_NPROC")
        .any(|limit| limit.soft == 99 && limit.hard == 99);
    assert!(nproc_matches);

    let memlock_matches = rlimits
        .iter()
        .filter(|limit| limit.limit_type == "RLIMIT_MEMLOCK")
        .any(|limit| limit.soft == 8 * 1024 * 1024 && limit.hard == 8 * 1024 * 1024);
    assert!(memlock_matches);
}
1830
// Without a pids limit there must be no RLIMIT_NPROC entry at all.
#[test]
fn test_oci_config_with_rlimits_omits_nproc_when_unlimited() {
    let limits = ResourceLimits {
        pids_max: None,
        ..ResourceLimits::default()
    };

    let config = OciConfig::new(vec!["/bin/sh".to_string()], None).with_rlimits(&limits);

    let nproc_present = config
        .process
        .rlimits
        .iter()
        .any(|limit| limit.limit_type == "RLIMIT_NPROC");
    assert!(
        !nproc_present,
        "RLIMIT_NPROC must be omitted when pids_max is unlimited"
    );
}
1849
// The container's PATH must be the fixed default, regardless of the host's.
#[test]
fn test_oci_config_uses_hardcoded_path_not_host() {
    // Deliberately set a recognisable host PATH that must not leak through.
    std::env::set_var("PATH", "/nix/store/secret-hash/bin:/home/user/.local/bin");

    let config = OciConfig::new(vec!["/bin/sh".to_string()], None);

    let path_env: &str = config
        .process
        .env
        .iter()
        .map(String::as_str)
        .find(|entry| entry.starts_with("PATH="))
        .expect("PATH env must be set");

    assert_eq!(
        path_env, "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
        "OCI config must not leak host PATH"
    );
    assert!(
        !path_env.contains("/nix/store/secret"),
        "Host PATH must not leak into container"
    );
}
1871
// Hooks must serialize with camelCase keys, omit empty hook lists, and
// deserialize back unchanged.
#[test]
fn test_oci_hooks_serialization_roundtrip() {
    let runtime_hook = OciHook {
        path: "/usr/bin/hook1".to_string(),
        args: vec!["hook1".to_string(), "--arg1".to_string()],
        env: vec!["FOO=bar".to_string()],
        timeout: Some(10),
    };
    let poststart_hook = OciHook {
        path: "/usr/bin/hook2".to_string(),
        args: Vec::new(),
        env: Vec::new(),
        timeout: None,
    };
    let hooks = OciHooks {
        create_runtime: vec![runtime_hook],
        create_container: Vec::new(),
        start_container: Vec::new(),
        poststart: vec![poststart_hook],
        poststop: Vec::new(),
    };

    let json = serde_json::to_string_pretty(&hooks).unwrap();
    assert!(json.contains("createRuntime"));
    assert!(json.contains("/usr/bin/hook1"));
    // Empty hook lists are left out of the JSON entirely.
    assert!(!json.contains("createContainer"));

    let decoded: OciHooks = serde_json::from_str(&json).unwrap();
    assert_eq!(decoded.create_runtime.len(), 1);
    let decoded_runtime_hook = &decoded.create_runtime[0];
    assert_eq!(decoded_runtime_hook.path, "/usr/bin/hook1");
    assert_eq!(decoded_runtime_hook.timeout, Some(10));
    assert_eq!(decoded.poststart.len(), 1);
    assert!(decoded.create_container.is_empty());
}
1904
// `is_empty` is true for the default hook set and false once any list has
// an entry.
#[test]
fn test_oci_hooks_is_empty() {
    assert!(OciHooks::default().is_empty());

    let cleanup_hook = OciHook {
        path: "/bin/cleanup".to_string(),
        args: Vec::new(),
        env: Vec::new(),
        timeout: None,
    };
    let with_poststop = OciHooks {
        poststop: vec![cleanup_hook],
        ..Default::default()
    };
    assert!(!with_poststop.is_empty());
}
1921
// Non-empty hooks must be stored on the config and survive a JSON round trip.
#[test]
fn test_oci_config_with_hooks() {
    let setup_hook = OciHook {
        path: "/usr/bin/setup".to_string(),
        args: Vec::new(),
        env: Vec::new(),
        timeout: None,
    };
    let hooks = OciHooks {
        create_runtime: vec![setup_hook],
        ..Default::default()
    };

    let config = OciConfig::new(vec!["/bin/sh".to_string()], None).with_hooks(hooks);
    assert!(config.hooks.is_some());

    let json = serde_json::to_string_pretty(&config).unwrap();
    for key in ["hooks", "createRuntime"] {
        assert!(json.contains(key));
    }

    let decoded: OciConfig = serde_json::from_str(&json).unwrap();
    let decoded_hooks = decoded.hooks.expect("hooks must survive the round trip");
    assert_eq!(decoded_hooks.create_runtime.len(), 1);
}
1945
// An empty hook set must collapse to `None` and leave no `hooks` key in the
// serialized config.
#[test]
fn test_oci_config_with_empty_hooks_serializes_without_hooks() {
    let args = vec!["/bin/sh".to_string()];
    let config = OciConfig::new(args, None).with_hooks(OciHooks::default());
    assert!(config.hooks.is_none());

    let json = serde_json::to_string_pretty(&config).unwrap();
    assert!(!json.contains("hooks"));
}
1955
// A hook whose path is not absolute must be rejected with an error that
// mentions "absolute".
#[test]
fn test_oci_hook_rejects_relative_path() {
    let relative_hook = OciHook {
        path: "relative/path".to_string(),
        args: Vec::new(),
        env: Vec::new(),
        timeout: None,
    };
    let state = OciContainerState {
        oci_version: "1.0.2".to_string(),
        id: "test".to_string(),
        status: OciStatus::Creating,
        pid: 1234,
        bundle: "/tmp/bundle".to_string(),
    };

    let outcome = OciHooks::run_hooks(&[relative_hook], &state, "test");

    assert!(outcome.is_err());
    let err_msg = outcome.unwrap_err().to_string();
    assert!(err_msg.contains("absolute"), "error: {}", err_msg);
}
1976
// Returns the PATH recorded in `/proc/self/environ`, or an empty string when
// that file cannot be read or holds no UTF-8 `PATH=` entry.
//
// Other tests in this module overwrite PATH via `std::env::set_var`; this
// helper reads the value from `/proc/self/environ` instead of `std::env`.
fn original_path() -> String {
    std::fs::read("/proc/self/environ")
        .ok()
        .and_then(|environ| {
            environ
                // Entries are NUL-separated `KEY=value` byte strings.
                .split(|&byte| byte == 0)
                .filter_map(|entry| std::str::from_utf8(entry).ok())
                .find_map(|entry| entry.strip_prefix("PATH=").map(str::to_string))
        })
        .unwrap_or_default()
}
1994
1995 fn find_bash() -> String {
1997 let candidates = ["/bin/bash", "/usr/bin/bash"];
1998 for c in &candidates {
1999 if std::path::Path::new(c).exists() {
2000 return c.to_string();
2001 }
2002 }
2003 for dir in original_path().split(':') {
2004 let candidate = std::path::PathBuf::from(dir).join("bash");
2005 if candidate.exists() {
2006 return candidate.to_string_lossy().to_string();
2007 }
2008 }
2009 panic!("Cannot find bash binary for test");
2010 }
2011
2012 fn write_script(path: &std::path::Path, body: &str) {
2016 use std::io::Write as IoWrite;
2017 let bash = find_bash();
2018 let orig_path = original_path();
2019 let content = format!("#!{}\nexport PATH='{}'\n{}", bash, orig_path, body);
2020 let mut f = OpenOptions::new()
2021 .create(true)
2022 .truncate(true)
2023 .write(true)
2024 .mode(0o755)
2025 .open(path)
2026 .unwrap();
2027 f.write_all(content.as_bytes()).unwrap();
2028 f.sync_all().unwrap();
2029 drop(f);
2030 }
2031
// A hook must receive the container state as JSON on stdin; the script
// copies stdin to a file, which is then inspected.
#[test]
fn test_oci_hook_executes_successfully() {
    let scratch = TempDir::new().unwrap();
    let hook_script = scratch.path().join("hook.sh");
    let output_file = scratch.path().join("output.json");

    let script_body = format!("cat > {}\n", output_file.to_string_lossy());
    write_script(&hook_script, &script_body);

    let hook = OciHook {
        path: hook_script.to_string_lossy().to_string(),
        args: Vec::new(),
        env: Vec::new(),
        timeout: Some(5),
    };
    let state = OciContainerState {
        oci_version: "1.0.2".to_string(),
        id: "test-container".to_string(),
        status: OciStatus::Creating,
        pid: 12345,
        bundle: "/tmp/test-bundle".to_string(),
    };

    OciHooks::run_hooks(&[hook], &state, "createRuntime").unwrap();

    let captured = std::fs::read_to_string(&output_file).unwrap();
    let parsed: serde_json::Value = serde_json::from_str(&captured).unwrap();
    assert_eq!(parsed["id"], "test-container");
    assert_eq!(parsed["pid"], 12345);
    assert_eq!(parsed["status"], "creating");
}
2066
// The hook must still run when its script is briefly held open for writing
// by another thread at the moment it is launched.
#[test]
fn test_oci_hook_retries_text_file_busy_spawn() {
    let scratch = TempDir::new().unwrap();
    let hook_script = scratch.path().join("hook.sh");
    let output_file = scratch.path().join("output.json");

    let script_body = format!("cat > {}\n", output_file.to_string_lossy());
    write_script(&hook_script, &script_body);

    // Background thread: hold a write handle on the script for ~100ms.
    // NOTE(review): presumably this makes the first exec attempt fail with
    // ETXTBSY until the handle is dropped — confirm against `run_hooks`.
    let (ready_tx, ready_rx) = std::sync::mpsc::channel();
    let script_for_writer = hook_script.clone();
    let writer_thread = std::thread::spawn(move || {
        let _write_handle = OpenOptions::new()
            .write(true)
            .open(&script_for_writer)
            .unwrap();
        ready_tx.send(()).unwrap();
        std::thread::sleep(std::time::Duration::from_millis(100));
    });
    // Do not proceed until the write handle is actually open.
    ready_rx.recv().unwrap();

    let hook = OciHook {
        path: hook_script.to_string_lossy().to_string(),
        args: Vec::new(),
        env: Vec::new(),
        timeout: Some(5),
    };
    let state = OciContainerState {
        oci_version: "1.0.2".to_string(),
        id: "test-container".to_string(),
        status: OciStatus::Creating,
        pid: 12345,
        bundle: "/tmp/test-bundle".to_string(),
    };

    let outcome = OciHooks::run_hooks(&[hook], &state, "createRuntime");
    // Join first so a panic in the writer thread is reported too.
    writer_thread.join().unwrap();
    outcome.unwrap();

    let captured = std::fs::read_to_string(&output_file).unwrap();
    let parsed: serde_json::Value = serde_json::from_str(&captured).unwrap();
    assert_eq!(parsed["id"], "test-container");
}
2109
// A hook that exits non-zero must surface as an error mentioning the exit
// status.
#[test]
fn test_oci_hook_nonzero_exit_is_error() {
    let scratch = TempDir::new().unwrap();
    let failing_script = scratch.path().join("fail.sh");
    write_script(&failing_script, "exit 1\n");

    let hook = OciHook {
        path: failing_script.to_string_lossy().to_string(),
        args: Vec::new(),
        env: Vec::new(),
        timeout: Some(5),
    };
    let state = OciContainerState {
        oci_version: "1.0.2".to_string(),
        id: "test".to_string(),
        status: OciStatus::Creating,
        pid: 1,
        bundle: "".to_string(),
    };

    let outcome = OciHooks::run_hooks(&[hook], &state, "test");

    assert!(outcome.is_err());
    let message = outcome.unwrap_err().to_string();
    assert!(message.contains("exited with status"));
}
2137
// In best-effort mode a failing hook must not stop later hooks from running.
#[test]
fn test_oci_hooks_best_effort_continues_on_failure() {
    let scratch = TempDir::new().unwrap();

    let fail_script = scratch.path().join("fail.sh");
    write_script(&fail_script, "exit 1\n");

    // The second hook leaves a marker file behind as proof that it ran.
    let marker = scratch.path().join("ran");
    let ok_script = scratch.path().join("ok.sh");
    let ok_body = format!("touch {}\n", marker.to_string_lossy());
    write_script(&ok_script, &ok_body);

    let failing_hook = OciHook {
        path: fail_script.to_string_lossy().to_string(),
        args: Vec::new(),
        env: Vec::new(),
        timeout: Some(5),
    };
    let succeeding_hook = OciHook {
        path: ok_script.to_string_lossy().to_string(),
        args: Vec::new(),
        env: Vec::new(),
        timeout: Some(5),
    };
    let hooks = vec![failing_hook, succeeding_hook];

    let state = OciContainerState {
        oci_version: "1.0.2".to_string(),
        id: "test".to_string(),
        status: OciStatus::Stopped,
        pid: 0,
        bundle: "".to_string(),
    };

    OciHooks::run_hooks_best_effort(&hooks, &state, "poststop");

    assert!(marker.exists(), "second hook should run after first fails");
}
2175}