Skip to main content

nucleus/oci/
mod.rs

1use crate::container::OciStatus;
2use crate::error::{NucleusError, Result};
3use crate::filesystem::normalize_container_destination;
4use crate::isolation::{IdMapping, NamespaceConfig, UserNamespaceConfig};
5use crate::resources::ResourceLimits;
6use serde::{Deserialize, Serialize};
7use std::collections::{BTreeSet, HashMap};
8use std::fs;
9use std::fs::OpenOptions;
10use std::io::Write;
11use std::os::unix::fs::{OpenOptionsExt, PermissionsExt};
12use std::path::{Path, PathBuf};
13use tracing::{debug, info, warn};
14
/// OCI Runtime Specification configuration
///
/// This implements a subset of the OCI runtime spec for gVisor compatibility
/// Spec: <https://github.com/opencontainers/runtime-spec/blob/main/config.md>
///
/// Field declaration order and the serde attributes below define the JSON
/// layout produced by the derived `Serialize` impl.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OciConfig {
    /// Runtime-spec version string; serialized under the key `ociVersion`.
    #[serde(rename = "ociVersion")]
    pub oci_version: String,

    /// Root filesystem settings.
    pub root: OciRoot,
    /// Description of the container process.
    pub process: OciProcess,
    /// Optional hostname for the container.
    pub hostname: Option<String>,
    /// Mounts listed in the config.
    pub mounts: Vec<OciMount>,
    /// Linux-specific settings, if any.
    pub linux: Option<OciLinux>,
    /// Lifecycle hooks; left out of the output when `None` and defaulted to
    /// `None` when missing from the input.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub hooks: Option<OciHooks>,
    /// Free-form key/value annotations; left out of the output when empty and
    /// defaulted to an empty map when missing from the input.
    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
    pub annotations: HashMap<String, String>,
}
34
/// Root filesystem entry of an [`OciConfig`] (its `root` field).
///
/// Both fields carry no serde default, so both must be present when
/// deserializing.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OciRoot {
    /// Path to the root filesystem.
    pub path: String,
    /// Whether the root filesystem is read-only.
    pub readonly: bool,
}
40
/// Process entry of an [`OciConfig`] (its `process` field).
///
/// Fields without a `default` attribute (other than `Option` fields) must be
/// present when deserializing.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OciProcess {
    /// Terminal flag for the process.
    pub terminal: bool,
    /// User and group IDs for the process.
    pub user: OciUser,
    /// Command line of the process.
    pub args: Vec<String>,
    /// Environment entries; `OciConfig::new` fills these as `KEY=VALUE` strings.
    pub env: Vec<String>,
    /// Working directory of the process.
    pub cwd: String,
    /// Serialized under the key `noNewPrivileges`.
    #[serde(rename = "noNewPrivileges")]
    pub no_new_privileges: bool,
    /// Capability sets for the process, if specified.
    pub capabilities: Option<OciCapabilities>,
    /// Resource limits; left out of the output when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub rlimits: Vec<OciRlimit>,
    /// Console dimensions; serialized as `consoleSize`, left out when `None`.
    #[serde(
        rename = "consoleSize",
        default,
        skip_serializing_if = "Option::is_none"
    )]
    pub console_size: Option<OciConsoleSize>,
    /// AppArmor profile name; serialized as `apparmorProfile`, left out when `None`.
    #[serde(
        rename = "apparmorProfile",
        default,
        skip_serializing_if = "Option::is_none"
    )]
    pub apparmor_profile: Option<String>,
    /// SELinux label; serialized as `selinuxLabel`, left out when `None`.
    #[serde(
        rename = "selinuxLabel",
        default,
        skip_serializing_if = "Option::is_none"
    )]
    pub selinux_label: Option<String>,
}
72
73#[derive(Debug, Clone, Serialize, Deserialize)]
74pub struct OciUser {
75    pub uid: u32,
76    pub gid: u32,
77    #[serde(skip_serializing_if = "Option::is_none")]
78    pub additional_gids: Option<Vec<u32>>,
79}
80
/// Capability sets of the container process (the `capabilities` field of
/// [`OciProcess`]).
///
/// Each set is a list of capability names. None of the fields has a serde
/// default, so all five keys must be present when deserializing.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OciCapabilities {
    /// Bounding set.
    pub bounding: Vec<String>,
    /// Effective set.
    pub effective: Vec<String>,
    /// Inheritable set.
    pub inheritable: Vec<String>,
    /// Permitted set.
    pub permitted: Vec<String>,
    /// Ambient set.
    pub ambient: Vec<String>,
}
89
/// A single mount entry (element of `OciConfig::mounts`).
///
/// All four keys must be present when deserializing; none has a serde default.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OciMount {
    /// Mount point inside the container (e.g. `/proc`).
    pub destination: String,
    /// Mount source (e.g. `proc`, `tmpfs`).
    pub source: String,
    /// Filesystem type; serialized under the key `type`.
    #[serde(rename = "type")]
    pub mount_type: String,
    /// Mount options (e.g. `nosuid`, `noexec`).
    pub options: Vec<String>,
}
98
/// Linux-specific section of an [`OciConfig`] (its `linux` field).
///
/// Every field is optional on the wire: empty collections and `None` values
/// are left out when serializing.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OciLinux {
    /// Namespaces for the container.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub namespaces: Option<Vec<OciNamespace>>,
    /// Resource limits (memory, CPU, pids).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub resources: Option<OciResources>,
    /// UID mappings; serialized as `uidMappings`.
    #[serde(rename = "uidMappings", skip_serializing_if = "Vec::is_empty", default)]
    pub uid_mappings: Vec<OciIdMapping>,
    /// GID mappings; serialized as `gidMappings`.
    #[serde(rename = "gidMappings", skip_serializing_if = "Vec::is_empty", default)]
    pub gid_mappings: Vec<OciIdMapping>,
    /// Paths to mask inside the container; serialized as `maskedPaths`.
    #[serde(rename = "maskedPaths", skip_serializing_if = "Vec::is_empty", default)]
    pub masked_paths: Vec<String>,
    /// Paths to make read-only inside the container; serialized as `readonlyPaths`.
    #[serde(
        rename = "readonlyPaths",
        skip_serializing_if = "Vec::is_empty",
        default
    )]
    pub readonly_paths: Vec<String>,
    /// Device nodes listed for the container.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub devices: Vec<OciDevice>,
    /// Seccomp filter configuration.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub seccomp: Option<OciSeccomp>,
    /// Rootfs mount propagation mode; serialized as `rootfsPropagation`.
    #[serde(
        rename = "rootfsPropagation",
        default,
        skip_serializing_if = "Option::is_none"
    )]
    pub rootfs_propagation: Option<String>,
    /// Sysctl key/value pairs.
    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
    pub sysctl: HashMap<String, String>,
    /// Cgroup path; serialized as `cgroupsPath`.
    #[serde(
        rename = "cgroupsPath",
        default,
        skip_serializing_if = "Option::is_none"
    )]
    pub cgroups_path: Option<String>,
    /// Intel RDT configuration; serialized as `intelRdt`.
    #[serde(rename = "intelRdt", default, skip_serializing_if = "Option::is_none")]
    pub intel_rdt: Option<OciIntelRdt>,
}
138
/// A single namespace entry (element of `OciLinux::namespaces`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OciNamespace {
    /// Namespace kind (e.g. `pid`, `network`, `mount`); serialized under the
    /// key `type`.
    #[serde(rename = "type")]
    pub namespace_type: String,
}
144
/// One UID or GID mapping range (element of `OciLinux::uid_mappings` /
/// `OciLinux::gid_mappings`).
///
/// Derives `PartialEq`/`Eq`, so mappings can be compared directly.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct OciIdMapping {
    /// First ID of the range inside the container; serialized as `containerID`.
    #[serde(rename = "containerID")]
    pub container_id: u32,
    /// First ID of the range on the host; serialized as `hostID`.
    #[serde(rename = "hostID")]
    pub host_id: u32,
    /// Number of IDs in the range.
    pub size: u32,
}
153
/// Resource limits (the `resources` field of [`OciLinux`]).
///
/// Each sub-section is left out of the output when `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OciResources {
    /// Memory limits.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub memory: Option<OciMemory>,
    /// CPU limits.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub cpu: Option<OciCpu>,
    /// Process-count limits.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub pids: Option<OciPids>,
}
163
/// Memory limits (the `memory` field of [`OciResources`]).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OciMemory {
    /// Memory limit; left out of the output when `None`.
    // NOTE(review): presumably bytes, as in the OCI runtime spec — confirm against callers.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub limit: Option<i64>,
}
169
/// CPU limits (the `cpu` field of [`OciResources`]).
// NOTE(review): quota/period are presumably microseconds, as in the OCI
// runtime spec — confirm against callers.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OciCpu {
    /// CPU quota; left out of the output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub quota: Option<i64>,
    /// CPU period; left out of the output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub period: Option<u64>,
}
177
/// Process-count limit (the `pids` field of [`OciResources`]).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OciPids {
    /// Limit value; always serialized and required when deserializing.
    pub limit: i64,
}
182
/// OCI process resource limit (element of `OciProcess::rlimits`).
///
/// Spec: <https://github.com/opencontainers/runtime-spec/blob/main/config.md#posix-process>
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OciRlimit {
    /// Resource type (e.g. "RLIMIT_NOFILE", "RLIMIT_NPROC"); serialized under
    /// the key `type`.
    #[serde(rename = "type")]
    pub limit_type: String,
    /// Hard limit
    pub hard: u64,
    /// Soft limit
    pub soft: u64,
}
196
/// OCI console size for terminal-attached processes (the `console_size` field
/// of [`OciProcess`]).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OciConsoleSize {
    /// Console height.
    pub height: u32,
    /// Console width.
    pub width: u32,
}
203
/// OCI linux device entry (element of `OciLinux::devices`).
///
/// Spec: <https://github.com/opencontainers/runtime-spec/blob/main/config-linux.md#devices>
///
/// Only `device_type` and `path` are always serialized; every `Option` field
/// is left out of the output when `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OciDevice {
    /// Device type: "c" (char), "b" (block), "u" (unbuffered), "p" (FIFO);
    /// serialized under the key `type`.
    #[serde(rename = "type")]
    pub device_type: String,
    /// Device path inside the container
    pub path: String,
    /// Major number
    #[serde(skip_serializing_if = "Option::is_none")]
    pub major: Option<i64>,
    /// Minor number
    #[serde(skip_serializing_if = "Option::is_none")]
    pub minor: Option<i64>,
    /// File mode (permissions); serialized as `fileMode`.
    #[serde(rename = "fileMode", skip_serializing_if = "Option::is_none")]
    pub file_mode: Option<u32>,
    /// UID of the device owner
    #[serde(skip_serializing_if = "Option::is_none")]
    pub uid: Option<u32>,
    /// GID of the device owner
    #[serde(skip_serializing_if = "Option::is_none")]
    pub gid: Option<u32>,
}
230
/// OCI seccomp configuration (the `seccomp` field of [`OciLinux`]).
///
/// Spec: <https://github.com/opencontainers/runtime-spec/blob/main/config-linux.md#seccomp>
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OciSeccomp {
    /// Default action when no rule matches (e.g. "SCMP_ACT_ERRNO", "SCMP_ACT_ALLOW");
    /// serialized as `defaultAction`.
    #[serde(rename = "defaultAction")]
    pub default_action: String,
    /// Target architectures; left out of the output when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub architectures: Vec<String>,
    /// Syscall rules; left out of the output when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub syscalls: Vec<OciSeccompSyscall>,
}
246
/// A single seccomp syscall rule (element of `OciSeccomp::syscalls`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OciSeccompSyscall {
    /// Syscall names this rule applies to
    pub names: Vec<String>,
    /// Action to take (e.g. "SCMP_ACT_ALLOW")
    pub action: String,
    /// Optional argument conditions; left out of the output when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub args: Vec<OciSeccompArg>,
}
258
/// Seccomp syscall argument filter (element of `OciSeccompSyscall::args`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OciSeccompArg {
    /// Argument index (0-based)
    pub index: u32,
    /// Value to compare against
    pub value: u64,
    /// Second value for masked operations; serialized as `valueTwo`, left out
    /// of the output when it is 0 and defaulted to 0 when missing from the input.
    #[serde(rename = "valueTwo", default, skip_serializing_if = "is_zero")]
    pub value_two: u64,
    /// Comparison operator (e.g. "SCMP_CMP_EQ", "SCMP_CMP_MASKED_EQ")
    pub op: String,
}
272
/// Serde `skip_serializing_if` predicate: true exactly when the value is 0.
fn is_zero(v: &u64) -> bool {
    matches!(*v, 0)
}
276
/// OCI Intel RDT (Resource Director Technology) configuration (the
/// `intel_rdt` field of [`OciLinux`]).
///
/// Spec: <https://github.com/opencontainers/runtime-spec/blob/main/config-linux.md#intel-rdt>
///
/// Every field is left out of the output when `None` and defaults to `None`
/// when missing from the input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OciIntelRdt {
    /// Unique identity for the container's cache and memory bandwidth allocation;
    /// serialized as `closID`.
    #[serde(rename = "closID", default, skip_serializing_if = "Option::is_none")]
    pub clos_id: Option<String>,
    /// Schema for L3 cache allocation; serialized as `l3CacheSchema`.
    #[serde(
        rename = "l3CacheSchema",
        default,
        skip_serializing_if = "Option::is_none"
    )]
    pub l3_cache_schema: Option<String>,
    /// Schema for memory bandwidth allocation; serialized as `memBwSchema`.
    #[serde(
        rename = "memBwSchema",
        default,
        skip_serializing_if = "Option::is_none"
    )]
    pub mem_bw_schema: Option<String>,
}
300
/// A single OCI lifecycle hook entry.
///
/// Spec: <https://github.com/opencontainers/runtime-spec/blob/main/config.md#posix-platform-hooks>
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OciHook {
    /// Absolute path to the hook binary. `OciHooks::execute_hook` rejects
    /// relative paths and symlinks.
    pub path: String,
    /// Arguments passed to the hook (argv\[0\] should be the binary name).
    /// The executor passes only `args[1..]` to the spawned command.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub args: Vec<String>,
    /// Environment variables for the hook process, as `KEY=VALUE` entries.
    /// When non-empty, the executor clears the inherited environment first.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub env: Vec<String>,
    /// Timeout in seconds. If the hook does not exit within this duration it is killed.
    /// The executor falls back to 30 seconds when this is `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub timeout: Option<u32>,
}
318
/// OCI lifecycle hooks.
///
/// Spec: <https://github.com/opencontainers/runtime-spec/blob/main/config.md#posix-platform-hooks>
///
/// Each phase holds an ordered list of hooks. Empty lists are left out of the
/// output, and the derived `Default` yields a value with no hooks in any phase.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct OciHooks {
    /// Called after the runtime environment has been created but before pivot_root.
    /// Serialized as `createRuntime`.
    #[serde(
        rename = "createRuntime",
        default,
        skip_serializing_if = "Vec::is_empty"
    )]
    pub create_runtime: Vec<OciHook>,
    /// Called after pivot_root but before the start operation.
    /// Serialized as `createContainer`.
    #[serde(
        rename = "createContainer",
        default,
        skip_serializing_if = "Vec::is_empty"
    )]
    pub create_container: Vec<OciHook>,
    /// Called after the start operation but before the user process executes.
    /// Serialized as `startContainer`.
    #[serde(
        rename = "startContainer",
        default,
        skip_serializing_if = "Vec::is_empty"
    )]
    pub start_container: Vec<OciHook>,
    /// Called after the user-specified process has started.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub poststart: Vec<OciHook>,
    /// Called after the container has been stopped.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub poststop: Vec<OciHook>,
}
352
/// Container state JSON passed to OCI hooks on stdin.
///
/// Spec: <https://github.com/opencontainers/runtime-spec/blob/main/runtime.md#state>
///
/// Serialize-only: this type derives `Serialize` but not `Deserialize`.
#[derive(Debug, Clone, Serialize)]
pub struct OciContainerState {
    /// Runtime-spec version string; serialized as `ociVersion`.
    #[serde(rename = "ociVersion")]
    pub oci_version: String,
    /// Container identifier.
    pub id: String,
    /// Container status (type defined in `crate::container`).
    pub status: OciStatus,
    /// Process ID reported to the hook.
    pub pid: u32,
    /// Bundle path reported to the hook.
    pub bundle: String,
}
365
366impl OciHooks {
367    /// Returns true if there are no hooks configured.
368    pub fn is_empty(&self) -> bool {
369        self.create_runtime.is_empty()
370            && self.create_container.is_empty()
371            && self.start_container.is_empty()
372            && self.poststart.is_empty()
373            && self.poststop.is_empty()
374    }
375
376    /// Execute a list of hooks in order, passing container state JSON on stdin.
377    ///
378    /// If any hook exits non-zero, an error is returned immediately (remaining hooks are skipped).
379    pub fn run_hooks(hooks: &[OciHook], state: &OciContainerState, phase: &str) -> Result<()> {
380        let state_json = serde_json::to_string(state).map_err(|e| {
381            NucleusError::HookError(format!(
382                "Failed to serialize container state for hook: {}",
383                e
384            ))
385        })?;
386
387        for (i, hook) in hooks.iter().enumerate() {
388            info!(
389                "Running {} hook [{}/{}]: {}",
390                phase,
391                i + 1,
392                hooks.len(),
393                hook.path
394            );
395            Self::execute_hook(hook, &state_json, phase)?;
396        }
397
398        Ok(())
399    }
400
401    /// Execute a list of hooks best-effort (log errors but don't fail).
402    ///
403    /// Used for poststop hooks per the OCI spec: errors MUST be logged but MUST NOT
404    /// prevent cleanup.
405    pub fn run_hooks_best_effort(hooks: &[OciHook], state: &OciContainerState, phase: &str) {
406        let state_json = match serde_json::to_string(state) {
407            Ok(json) => json,
408            Err(e) => {
409                warn!(
410                    "Failed to serialize container state for {} hooks: {}",
411                    phase, e
412                );
413                return;
414            }
415        };
416
417        for (i, hook) in hooks.iter().enumerate() {
418            info!(
419                "Running {} hook [{}/{}]: {}",
420                phase,
421                i + 1,
422                hooks.len(),
423                hook.path
424            );
425            if let Err(e) = Self::execute_hook(hook, &state_json, phase) {
426                warn!("{} hook [{}] failed (continuing): {}", phase, i + 1, e);
427            }
428        }
429    }
430
431    fn execute_hook(hook: &OciHook, state_json: &str, phase: &str) -> Result<()> {
432        #[cfg(not(test))]
433        use std::os::unix::process::CommandExt;
434        use std::process::{Command, Stdio};
435
436        let hook_path = Path::new(&hook.path);
437        if !hook_path.is_absolute() {
438            return Err(NucleusError::HookError(format!(
439                "{} hook path must be absolute: {}",
440                phase, hook.path
441            )));
442        }
443
444        // Restrict hooks to trusted system directories. Hooks execute in
445        // the parent process before security hardening (by OCI spec), so
446        // they must come from locations that unprivileged users cannot write to.
447        #[cfg(not(test))]
448        {
449            const TRUSTED_HOOK_PREFIXES: &[&str] = &[
450                "/usr/bin/",
451                "/usr/sbin/",
452                "/usr/lib/",
453                "/usr/libexec/",
454                "/usr/local/bin/",
455                "/usr/local/sbin/",
456                "/usr/local/libexec/",
457                "/bin/",
458                "/sbin/",
459                "/nix/store/",
460                "/opt/",
461            ];
462            if !TRUSTED_HOOK_PREFIXES
463                .iter()
464                .any(|prefix| hook.path.starts_with(prefix))
465            {
466                return Err(NucleusError::HookError(format!(
467                    "{} hook path '{}' is not under a trusted directory ({:?})",
468                    phase, hook.path, TRUSTED_HOOK_PREFIXES
469                )));
470            }
471        }
472
473        // Use symlink_metadata (lstat) instead of .exists() to avoid
474        // following symlinks in the existence check. Reject symlinked hooks
475        // to prevent a TOCTOU swap between the check and exec.
476        match std::fs::symlink_metadata(hook_path) {
477            Ok(meta) if meta.file_type().is_symlink() => {
478                return Err(NucleusError::HookError(format!(
479                    "{} hook path is a symlink (refusing to follow): {}",
480                    phase, hook.path
481                )));
482            }
483            Err(_) => {
484                return Err(NucleusError::HookError(format!(
485                    "{} hook binary not found: {}",
486                    phase, hook.path
487                )));
488            }
489            Ok(_) => {}
490        }
491
492        // C-1: Validate hook binary ownership and permissions to prevent
493        // execution of world-writable or unexpectedly-owned binaries.
494        // Similar to runsc's hook validation – reject hooks that could be
495        // tampered with by unprivileged users.
496        Self::validate_hook_binary(hook_path, phase)?;
497
498        let mut cmd = Command::new(&hook.path);
499        if !hook.args.is_empty() {
500            // OCI spec: args[0] is the binary name (like execve argv); pass rest as arguments
501            cmd.args(&hook.args[1..]);
502        }
503
504        if !hook.env.is_empty() {
505            cmd.env_clear();
506            for entry in &hook.env {
507                if let Some((key, value)) = entry.split_once('=') {
508                    cmd.env(key, value);
509                }
510            }
511        }
512
513        // C-1: Drop all capabilities and set restrictive resource limits
514        // for hook execution. Hooks run in the parent process before security
515        // hardening, so we sandbox them defensively.
516        cmd.stdin(Stdio::piped());
517        cmd.stdout(Stdio::piped());
518        cmd.stderr(Stdio::piped());
519
520        // C-1: Apply RLIMIT backstops only in the spawned child process
521        // via pre_exec, so the parent process is not affected.
522        // Note: pre_exec runs after fork but before exec, in the child process.
523        #[cfg(not(test))]
524        unsafe {
525            cmd.pre_exec(|| {
526                // Prevent the hook from gaining privileges via setuid/setgid
527                // binaries or file capabilities. This must be set before exec.
528                if libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) != 0 {
529                    return Err(std::io::Error::last_os_error());
530                }
531
532                let rlim_nproc = libc::rlimit {
533                    rlim_cur: 1024,
534                    rlim_max: 1024,
535                };
536                if libc::setrlimit(libc::RLIMIT_NPROC, &rlim_nproc) != 0 {
537                    return Err(std::io::Error::last_os_error());
538                }
539
540                let rlim_nofile = libc::rlimit {
541                    rlim_cur: 1024,
542                    rlim_max: 1024,
543                };
544                if libc::setrlimit(libc::RLIMIT_NOFILE, &rlim_nofile) != 0 {
545                    return Err(std::io::Error::last_os_error());
546                }
547
548                Ok(())
549            });
550        }
551
552        const TEXT_FILE_BUSY_SPAWN_RETRIES: usize = 100;
553        const TEXT_FILE_BUSY_RETRY_DELAY: std::time::Duration =
554            std::time::Duration::from_millis(10);
555
556        let mut text_file_busy_retries = 0;
557        let mut child = loop {
558            match cmd.spawn() {
559                Ok(child) => break child,
560                Err(e)
561                    if e.raw_os_error() == Some(libc::ETXTBSY)
562                        && text_file_busy_retries < TEXT_FILE_BUSY_SPAWN_RETRIES =>
563                {
564                    text_file_busy_retries += 1;
565                    debug!(
566                        "{} hook {} was busy during spawn; retrying ({}/{})",
567                        phase, hook.path, text_file_busy_retries, TEXT_FILE_BUSY_SPAWN_RETRIES
568                    );
569                    std::thread::sleep(TEXT_FILE_BUSY_RETRY_DELAY);
570                }
571                Err(e) => {
572                    return Err(NucleusError::HookError(format!(
573                        "Failed to spawn {} hook {}: {}",
574                        phase, hook.path, e
575                    )));
576                }
577            }
578        };
579
580        if let Some(mut stdin) = child.stdin.take() {
581            use std::io::Write as IoWrite;
582            let _ = stdin.write_all(state_json.as_bytes());
583        }
584
585        let timeout_secs = hook.timeout.unwrap_or(30) as u64;
586        let start = std::time::Instant::now();
587        let timeout = std::time::Duration::from_secs(timeout_secs);
588
589        loop {
590            match child.try_wait() {
591                Ok(Some(status)) => {
592                    if status.success() {
593                        debug!("{} hook {} completed successfully", phase, hook.path);
594                        return Ok(());
595                    } else {
596                        let stderr = child
597                            .stderr
598                            .take()
599                            .map(|mut e| {
600                                let mut buf = String::new();
601                                use std::io::Read;
602                                let _ = e.read_to_string(&mut buf);
603                                buf
604                            })
605                            .unwrap_or_default();
606                        return Err(NucleusError::HookError(format!(
607                            "{} hook {} exited with status: {}{}",
608                            phase,
609                            hook.path,
610                            status,
611                            if stderr.is_empty() {
612                                String::new()
613                            } else {
614                                format!(" (stderr: {})", stderr.trim())
615                            }
616                        )));
617                    }
618                }
619                Ok(None) => {
620                    if start.elapsed() >= timeout {
621                        let _ = child.kill();
622                        let _ = child.wait();
623                        return Err(NucleusError::HookError(format!(
624                            "{} hook {} timed out after {}s",
625                            phase, hook.path, timeout_secs
626                        )));
627                    }
628                    std::thread::sleep(std::time::Duration::from_millis(50));
629                }
630                Err(e) => {
631                    return Err(NucleusError::HookError(format!(
632                        "Failed to wait for {} hook {}: {}",
633                        phase, hook.path, e
634                    )));
635                }
636            }
637        }
638    }
639
640    /// Validate hook binary ownership and permissions.
641    ///
642    /// Rejects hooks that are world-writable or group-writable, or owned by
643    /// a UID that doesn't match the effective UID or root. This prevents
644    /// privilege escalation via tampered hook binaries.
645    fn validate_hook_binary(hook_path: &Path, phase: &str) -> Result<()> {
646        // Use symlink_metadata (lstat) to inspect the hook path itself
647        // rather than following symlinks, consistent with the rejection
648        // of symlinked hooks above.
649        let metadata = std::fs::symlink_metadata(hook_path).map_err(|e| {
650            NucleusError::HookError(format!(
651                "Failed to stat {} hook {}: {}",
652                phase,
653                hook_path.display(),
654                e
655            ))
656        })?;
657
658        use std::os::unix::fs::MetadataExt;
659        let mode = metadata.mode();
660        let uid = metadata.uid();
661        let gid = metadata.gid();
662        let effective_uid = nix::unistd::Uid::effective().as_raw();
663
664        // Reject world-writable hooks
665        if mode & 0o002 != 0 {
666            return Err(NucleusError::HookError(format!(
667                "{} hook {} is world-writable (mode {:04o}) – refusing to execute",
668                phase,
669                hook_path.display(),
670                mode & 0o7777
671            )));
672        }
673
674        // Reject group-writable hooks unless owned by root
675        if mode & 0o020 != 0 && uid != 0 {
676            return Err(NucleusError::HookError(format!(
677                "{} hook {} is group-writable and not owned by root (mode {:04o}, uid {}) – refusing to execute",
678                phase,
679                hook_path.display(),
680                mode & 0o7777,
681                uid
682            )));
683        }
684
685        // Reject hooks owned by arbitrary UIDs – must be root or effective UID
686        if uid != 0 && uid != effective_uid {
687            return Err(NucleusError::HookError(format!(
688                "{} hook {} is owned by UID {} (expected 0 or {}) – refusing to execute",
689                phase,
690                hook_path.display(),
691                uid,
692                effective_uid
693            )));
694        }
695
696        // Reject hooks with setuid/setgid bits
697        if mode & 0o6000 != 0 {
698            return Err(NucleusError::HookError(format!(
699                "{} hook {} has setuid/setgid bits (mode {:04o}) – refusing to execute",
700                phase,
701                hook_path.display(),
702                mode & 0o7777
703            )));
704        }
705
706        debug!(
707            "{} hook {} validation passed (uid={}, gid={}, mode={:04o})",
708            phase,
709            hook_path.display(),
710            uid,
711            gid,
712            mode & 0o7777
713        );
714
715        Ok(())
716    }
717}
718
719impl OciConfig {
720    /// Create a minimal OCI config for Nucleus containers
721    pub fn new(command: Vec<String>, hostname: Option<String>) -> Self {
722        Self {
723            oci_version: "1.0.2".to_string(),
724            root: OciRoot {
725                path: "rootfs".to_string(),
726                readonly: true,
727            },
728            process: OciProcess {
729                terminal: false,
730                user: OciUser {
731                    uid: 0,
732                    gid: 0,
733                    additional_gids: None,
734                },
735                args: command,
736                env: vec![
737                    "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin".to_string(),
738                ],
739                cwd: "/".to_string(),
740                no_new_privileges: true,
741                capabilities: Some(OciCapabilities {
742                    bounding: vec![],
743                    effective: vec![],
744                    inheritable: vec![],
745                    permitted: vec![],
746                    ambient: vec![],
747                }),
748                rlimits: vec![],
749                console_size: None,
750                apparmor_profile: None,
751                selinux_label: None,
752            },
753            hostname,
754            mounts: vec![
755                OciMount {
756                    destination: "/proc".to_string(),
757                    source: "proc".to_string(),
758                    mount_type: "proc".to_string(),
759                    options: vec![
760                        "nosuid".to_string(),
761                        "noexec".to_string(),
762                        "nodev".to_string(),
763                    ],
764                },
765                OciMount {
766                    destination: "/dev".to_string(),
767                    source: "tmpfs".to_string(),
768                    mount_type: "tmpfs".to_string(),
769                    options: vec![
770                        "nosuid".to_string(),
771                        "noexec".to_string(),
772                        "strictatime".to_string(),
773                        "mode=755".to_string(),
774                        "size=65536k".to_string(),
775                    ],
776                },
777                OciMount {
778                    destination: "/dev/shm".to_string(),
779                    source: "shm".to_string(),
780                    mount_type: "tmpfs".to_string(),
781                    options: vec![
782                        "nosuid".to_string(),
783                        "noexec".to_string(),
784                        "nodev".to_string(),
785                        "mode=1777".to_string(),
786                        "size=65536k".to_string(),
787                    ],
788                },
789                OciMount {
790                    destination: "/tmp".to_string(),
791                    source: "tmpfs".to_string(),
792                    mount_type: "tmpfs".to_string(),
793                    options: vec![
794                        "nosuid".to_string(),
795                        "nodev".to_string(),
796                        "noexec".to_string(),
797                        "mode=1777".to_string(),
798                        "size=65536k".to_string(),
799                    ],
800                },
801                OciMount {
802                    destination: "/sys".to_string(),
803                    source: "sysfs".to_string(),
804                    mount_type: "sysfs".to_string(),
805                    options: vec![
806                        "nosuid".to_string(),
807                        "noexec".to_string(),
808                        "nodev".to_string(),
809                        "ro".to_string(),
810                    ],
811                },
812            ],
813            hooks: None,
814            annotations: HashMap::new(),
815            linux: Some(OciLinux {
816                namespaces: Some(vec![
817                    OciNamespace {
818                        namespace_type: "pid".to_string(),
819                    },
820                    OciNamespace {
821                        namespace_type: "network".to_string(),
822                    },
823                    OciNamespace {
824                        namespace_type: "ipc".to_string(),
825                    },
826                    OciNamespace {
827                        namespace_type: "uts".to_string(),
828                    },
829                    OciNamespace {
830                        namespace_type: "mount".to_string(),
831                    },
832                ]),
833                resources: None,
834                uid_mappings: vec![],
835                gid_mappings: vec![],
836                // M14: Aligned with native masked paths in mount.rs (PROC_NULL_MASKED)
837                masked_paths: vec![
838                    "/proc/acpi".to_string(),
839                    "/proc/asound".to_string(),
840                    "/proc/kcore".to_string(),
841                    "/proc/keys".to_string(),
842                    "/proc/latency_stats".to_string(),
843                    "/proc/sched_debug".to_string(),
844                    "/proc/scsi".to_string(),
845                    "/proc/timer_list".to_string(),
846                    "/proc/timer_stats".to_string(),
847                    "/proc/sysrq-trigger".to_string(), // M14: null-mask, not read-only
848                    "/proc/kpagecount".to_string(),
849                    "/proc/kpageflags".to_string(),
850                    "/proc/kpagecgroup".to_string(),
851                    "/proc/config.gz".to_string(),
852                    "/proc/kallsyms".to_string(),
853                    "/sys/firmware".to_string(),
854                ],
855                readonly_paths: vec![
856                    "/proc/bus".to_string(),
857                    "/proc/fs".to_string(),
858                    "/proc/irq".to_string(),
859                    "/proc/sys".to_string(),
860                ],
861                devices: vec![
862                    OciDevice {
863                        device_type: "c".to_string(),
864                        path: "/dev/null".to_string(),
865                        major: Some(1),
866                        minor: Some(3),
867                        file_mode: Some(0o666),
868                        uid: Some(0),
869                        gid: Some(0),
870                    },
871                    OciDevice {
872                        device_type: "c".to_string(),
873                        path: "/dev/zero".to_string(),
874                        major: Some(1),
875                        minor: Some(5),
876                        file_mode: Some(0o666),
877                        uid: Some(0),
878                        gid: Some(0),
879                    },
880                    OciDevice {
881                        device_type: "c".to_string(),
882                        path: "/dev/full".to_string(),
883                        major: Some(1),
884                        minor: Some(7),
885                        file_mode: Some(0o666),
886                        uid: Some(0),
887                        gid: Some(0),
888                    },
889                    OciDevice {
890                        device_type: "c".to_string(),
891                        path: "/dev/random".to_string(),
892                        major: Some(1),
893                        minor: Some(8),
894                        file_mode: Some(0o666),
895                        uid: Some(0),
896                        gid: Some(0),
897                    },
898                    OciDevice {
899                        device_type: "c".to_string(),
900                        path: "/dev/urandom".to_string(),
901                        major: Some(1),
902                        minor: Some(9),
903                        file_mode: Some(0o666),
904                        uid: Some(0),
905                        gid: Some(0),
906                    },
907                ],
908                seccomp: None,
909                rootfs_propagation: Some("rprivate".to_string()),
910                sysctl: HashMap::new(),
911                cgroups_path: None,
912                intel_rdt: None,
913            }),
914        }
915    }
916
917    /// Add resource limits to the config
918    pub fn with_resources(mut self, limits: &ResourceLimits) -> Self {
919        let mut resources = OciResources {
920            memory: None,
921            cpu: None,
922            pids: None,
923        };
924
925        if let Some(memory_bytes) = limits.memory_bytes {
926            resources.memory = Some(OciMemory {
927                limit: Some(memory_bytes as i64),
928            });
929        }
930
931        if let Some(quota_us) = limits.cpu_quota_us {
932            resources.cpu = Some(OciCpu {
933                quota: Some(quota_us as i64),
934                period: Some(limits.cpu_period_us),
935            });
936        }
937
938        if let Some(pids_max) = limits.pids_max {
939            resources.pids = Some(OciPids {
940                limit: pids_max as i64,
941            });
942        }
943
944        if let Some(linux) = &mut self.linux {
945            linux.resources = Some(resources);
946        }
947
948        self
949    }
950
951    /// Configure the OCI noNewPrivileges process flag.
952    pub fn with_no_new_privileges(mut self, enabled: bool) -> Self {
953        self.process.no_new_privileges = enabled;
954        self
955    }
956
957    /// Add environment variables to the OCI process config.
958    pub fn with_env(mut self, vars: &[(String, String)]) -> Self {
959        for (key, value) in vars {
960            self.process.env.push(format!("{}={}", key, value));
961        }
962        self
963    }
964
965    /// Add sd_notify socket passthrough.
966    pub fn with_sd_notify(mut self) -> Self {
967        if let Ok(notify_socket) = std::env::var("NOTIFY_SOCKET") {
968            self.process
969                .env
970                .push(format!("NOTIFY_SOCKET={}", notify_socket));
971        }
972        self
973    }
974
975    /// Add bind mounts for secrets.
976    pub fn with_secret_mounts(mut self, secrets: &[crate::container::SecretMount]) -> Self {
977        for secret in secrets {
978            self.mounts.push(OciMount {
979                destination: secret.dest.to_string_lossy().to_string(),
980                source: secret.source.to_string_lossy().to_string(),
981                mount_type: "bind".to_string(),
982                options: vec![
983                    "bind".to_string(),
984                    "ro".to_string(),
985                    "nosuid".to_string(),
986                    "nodev".to_string(),
987                    "noexec".to_string(),
988                ],
989            });
990        }
991        self
992    }
993
994    /// Set the process identity for the OCI workload.
995    pub fn with_process_identity(mut self, identity: &crate::container::ProcessIdentity) -> Self {
996        self.process.user.uid = identity.uid;
997        self.process.user.gid = identity.gid;
998        self.process.user.additional_gids = if identity.additional_gids.is_empty() {
999            None
1000        } else {
1001            Some(identity.additional_gids.clone())
1002        };
1003        self
1004    }
1005
1006    /// Add a read-only bind mount of an in-memory secret staging directory at
1007    /// `/run/secrets`, plus compatibility bind mounts for each staged secret to
1008    /// its requested container destination.
1009    pub fn with_inmemory_secret_mounts(
1010        mut self,
1011        stage_dir: &Path,
1012        secrets: &[crate::container::SecretMount],
1013    ) -> Result<Self> {
1014        self.mounts.push(OciMount {
1015            destination: "/run/secrets".to_string(),
1016            source: stage_dir.to_string_lossy().to_string(),
1017            mount_type: "bind".to_string(),
1018            options: vec![
1019                "bind".to_string(),
1020                "ro".to_string(),
1021                "nosuid".to_string(),
1022                "nodev".to_string(),
1023                "noexec".to_string(),
1024            ],
1025        });
1026
1027        for secret in secrets {
1028            let dest = normalize_container_destination(&secret.dest)?;
1029            if !secret.source.starts_with(stage_dir) {
1030                return Err(NucleusError::ConfigError(format!(
1031                    "Staged secret source {:?} must live under {:?}",
1032                    secret.source, stage_dir
1033                )));
1034            }
1035            self.mounts.push(OciMount {
1036                destination: dest.to_string_lossy().to_string(),
1037                source: secret.source.to_string_lossy().to_string(),
1038                mount_type: "bind".to_string(),
1039                options: vec![
1040                    "bind".to_string(),
1041                    "ro".to_string(),
1042                    "nosuid".to_string(),
1043                    "nodev".to_string(),
1044                    "noexec".to_string(),
1045                ],
1046            });
1047        }
1048
1049        Ok(self)
1050    }
1051
1052    /// Add bind or tmpfs volume mounts.
1053    pub fn with_volume_mounts(mut self, volumes: &[crate::container::VolumeMount]) -> Result<Self> {
1054        use crate::container::VolumeSource;
1055
1056        for volume in volumes {
1057            let dest = normalize_container_destination(&volume.dest)?;
1058            match &volume.source {
1059                VolumeSource::Bind { source } => {
1060                    crate::filesystem::validate_bind_mount_source(source)?;
1061                    let mut options = vec![
1062                        "bind".to_string(),
1063                        "nosuid".to_string(),
1064                        "nodev".to_string(),
1065                    ];
1066                    if volume.read_only {
1067                        options.push("ro".to_string());
1068                    }
1069                    self.mounts.push(OciMount {
1070                        destination: dest.to_string_lossy().to_string(),
1071                        source: source.to_string_lossy().to_string(),
1072                        mount_type: "bind".to_string(),
1073                        options,
1074                    });
1075                }
1076                VolumeSource::Tmpfs { size } => {
1077                    let mut options = vec![
1078                        "nosuid".to_string(),
1079                        "nodev".to_string(),
1080                        "mode=0755".to_string(),
1081                    ];
1082                    if volume.read_only {
1083                        options.push("ro".to_string());
1084                    }
1085                    if let Some(size) = size {
1086                        options.push(format!("size={}", size));
1087                    }
1088                    self.mounts.push(OciMount {
1089                        destination: dest.to_string_lossy().to_string(),
1090                        source: "tmpfs".to_string(),
1091                        mount_type: "tmpfs".to_string(),
1092                        options,
1093                    });
1094                }
1095            }
1096        }
1097
1098        Ok(self)
1099    }
1100
1101    /// Bind mount the host context directory into the container.
1102    ///
1103    /// The gVisor integration path expects `/context` to be writable so test
1104    /// workloads can write results back to the host.
1105    pub fn with_context_bind(mut self, context_dir: &std::path::Path) -> Self {
1106        self.mounts.push(OciMount {
1107            destination: "/context".to_string(),
1108            source: context_dir.to_string_lossy().to_string(),
1109            mount_type: "bind".to_string(),
1110            options: vec![
1111                "bind".to_string(),
1112                "ro".to_string(),
1113                "nosuid".to_string(),
1114                "nodev".to_string(),
1115            ],
1116        });
1117        self
1118    }
1119
1120    /// Add rootfs bind mounts from a pre-built rootfs path.
1121    pub fn with_rootfs_binds(mut self, rootfs_path: &std::path::Path) -> Self {
1122        let subdirs = ["bin", "sbin", "lib", "lib64", "usr", "etc", "nix"];
1123        for subdir in &subdirs {
1124            let source = rootfs_path.join(subdir);
1125            if source.exists() {
1126                self.mounts.push(OciMount {
1127                    destination: format!("/{}", subdir),
1128                    source: source.to_string_lossy().to_string(),
1129                    mount_type: "bind".to_string(),
1130                    options: vec![
1131                        "bind".to_string(),
1132                        "ro".to_string(),
1133                        "nosuid".to_string(),
1134                        "nodev".to_string(),
1135                    ],
1136                });
1137            }
1138        }
1139        self
1140    }
1141
1142    /// Replace the default namespace list with an explicit configuration.
1143    pub fn with_namespace_config(mut self, config: &NamespaceConfig) -> Self {
1144        let mut namespaces = Vec::new();
1145
1146        if config.pid {
1147            namespaces.push(OciNamespace {
1148                namespace_type: "pid".to_string(),
1149            });
1150        }
1151        if config.net {
1152            namespaces.push(OciNamespace {
1153                namespace_type: "network".to_string(),
1154            });
1155        }
1156        if config.ipc {
1157            namespaces.push(OciNamespace {
1158                namespace_type: "ipc".to_string(),
1159            });
1160        }
1161        if config.uts {
1162            namespaces.push(OciNamespace {
1163                namespace_type: "uts".to_string(),
1164            });
1165        }
1166        if config.mnt {
1167            namespaces.push(OciNamespace {
1168                namespace_type: "mount".to_string(),
1169            });
1170        }
1171        if config.cgroup {
1172            namespaces.push(OciNamespace {
1173                namespace_type: "cgroup".to_string(),
1174            });
1175        }
1176        if config.time {
1177            namespaces.push(OciNamespace {
1178                namespace_type: "time".to_string(),
1179            });
1180        }
1181        if config.user {
1182            namespaces.push(OciNamespace {
1183                namespace_type: "user".to_string(),
1184            });
1185        }
1186
1187        if let Some(linux) = &mut self.linux {
1188            linux.namespaces = Some(namespaces);
1189        }
1190
1191        self
1192    }
1193
1194    /// Add read-only bind mounts for host runtime paths.
1195    ///
1196    /// This mirrors the native fallback path for non-production containers so
1197    /// common executables such as `/bin/sh` remain available inside the OCI
1198    /// rootfs when no explicit rootfs is configured.
1199    pub fn with_host_runtime_binds(mut self) -> Self {
1200        // Use a fixed set of standard FHS paths only. Do NOT scan host $PATH,
1201        // which would expose arbitrary host directories inside the container.
1202        let host_paths: BTreeSet<String> =
1203            ["/bin", "/sbin", "/usr", "/lib", "/lib64", "/nix/store"]
1204                .iter()
1205                .map(|s| s.to_string())
1206                .collect();
1207
1208        for host_path in host_paths {
1209            let source = Path::new(&host_path);
1210            if !source.exists() {
1211                continue;
1212            }
1213
1214            self.mounts.push(OciMount {
1215                destination: host_path.clone(),
1216                source: source.to_string_lossy().to_string(),
1217                mount_type: "bind".to_string(),
1218                options: vec![
1219                    "bind".to_string(),
1220                    "ro".to_string(),
1221                    "nosuid".to_string(),
1222                    "nodev".to_string(),
1223                ],
1224            });
1225        }
1226        self
1227    }
1228
1229    /// Add user namespace configuration
1230    pub fn with_user_namespace(mut self) -> Self {
1231        if let Some(linux) = &mut self.linux {
1232            if let Some(namespaces) = &mut linux.namespaces {
1233                namespaces.push(OciNamespace {
1234                    namespace_type: "user".to_string(),
1235                });
1236            }
1237        }
1238        self
1239    }
1240
1241    /// Remove the OCI network namespace entry so runsc inherits the process
1242    /// network namespace that Nucleus prepared before exec.
1243    pub fn without_network_namespace(mut self) -> Self {
1244        if let Some(linux) = &mut self.linux {
1245            if let Some(namespaces) = &mut linux.namespaces {
1246                namespaces.retain(|ns| ns.namespace_type != "network");
1247            }
1248        }
1249
1250        self
1251    }
1252
1253    /// Configure gVisor's true rootless OCI path.
1254    ///
1255    /// gVisor expects UID/GID mappings in the OCI spec for this mode, and its
1256    /// rootless OCI implementation does not currently support a network
1257    /// namespace entry in the spec. We still control networking through
1258    /// runsc's top-level `--network` flag.
1259    pub fn with_rootless_user_namespace(mut self, config: &UserNamespaceConfig) -> Self {
1260        if let Some(linux) = &mut self.linux {
1261            if let Some(namespaces) = &mut linux.namespaces {
1262                namespaces.retain(|ns| ns.namespace_type != "network");
1263                if !namespaces.iter().any(|ns| ns.namespace_type == "user") {
1264                    namespaces.push(OciNamespace {
1265                        namespace_type: "user".to_string(),
1266                    });
1267                }
1268            }
1269            linux.uid_mappings = config.uid_mappings.iter().map(OciIdMapping::from).collect();
1270            linux.gid_mappings = config.gid_mappings.iter().map(OciIdMapping::from).collect();
1271        }
1272        self
1273    }
1274
1275    /// Set OCI lifecycle hooks on the config.
1276    pub fn with_hooks(mut self, hooks: OciHooks) -> Self {
1277        if hooks.is_empty() {
1278            self.hooks = None;
1279        } else {
1280            self.hooks = Some(hooks);
1281        }
1282        self
1283    }
1284
1285    /// Set process rlimits from the Nucleus runtime defaults and configured limits.
1286    ///
1287    /// Mirrors the RLIMIT backstops applied in-process for native containers
1288    /// (runtime.rs), expressed as OCI config so gVisor can enforce them.
1289    pub fn with_rlimits(mut self, limits: &ResourceLimits) -> Self {
1290        let mut rlimits = Vec::with_capacity(3);
1291
1292        if let Some(nproc_limit) = limits.pids_max {
1293            rlimits.push(OciRlimit {
1294                limit_type: "RLIMIT_NPROC".to_string(),
1295                hard: nproc_limit,
1296                soft: nproc_limit,
1297            });
1298        }
1299
1300        rlimits.push(OciRlimit {
1301            limit_type: "RLIMIT_NOFILE".to_string(),
1302            hard: 1024,
1303            soft: 1024,
1304        });
1305
1306        let memlock_limit = limits.memlock_bytes.unwrap_or(64 * 1024);
1307        rlimits.push(OciRlimit {
1308            limit_type: "RLIMIT_MEMLOCK".to_string(),
1309            hard: memlock_limit,
1310            soft: memlock_limit,
1311        });
1312
1313        self.process.rlimits = rlimits;
1314        self
1315    }
1316
1317    /// Set the linux.seccomp section from an OCI seccomp config.
1318    pub fn with_seccomp(mut self, seccomp: OciSeccomp) -> Self {
1319        if let Some(linux) = &mut self.linux {
1320            linux.seccomp = Some(seccomp);
1321        }
1322        self
1323    }
1324
1325    /// Set the linux.cgroupsPath field.
1326    pub fn with_cgroups_path(mut self, path: String) -> Self {
1327        if let Some(linux) = &mut self.linux {
1328            linux.cgroups_path = Some(path);
1329        }
1330        self
1331    }
1332
1333    /// Set sysctl key-value pairs on the linux config.
1334    pub fn with_sysctl(mut self, sysctl: HashMap<String, String>) -> Self {
1335        if let Some(linux) = &mut self.linux {
1336            linux.sysctl = sysctl;
1337        }
1338        self
1339    }
1340
1341    /// Set annotations on the OCI config.
1342    pub fn with_annotations(mut self, annotations: HashMap<String, String>) -> Self {
1343        self.annotations = annotations;
1344        self
1345    }
1346}
1347
1348impl From<&IdMapping> for OciIdMapping {
1349    fn from(mapping: &IdMapping) -> Self {
1350        Self {
1351            container_id: mapping.container_id,
1352            host_id: mapping.host_id,
1353            size: mapping.count,
1354        }
1355    }
1356}
1357
/// OCI Bundle manager
///
/// Creates and manages OCI-compliant bundles for gVisor. A bundle is a
/// directory holding a `config.json` and a `rootfs` subdirectory.
pub struct OciBundle {
    // Directory that holds `config.json` and the `rootfs` subdirectory.
    bundle_path: PathBuf,
    // Runtime configuration serialized to `config.json` by `create`.
    config: OciConfig,
}
1365
1366impl OciBundle {
1367    /// Create a new OCI bundle
1368    pub fn new(bundle_path: PathBuf, config: OciConfig) -> Self {
1369        Self {
1370            bundle_path,
1371            config,
1372        }
1373    }
1374
1375    /// Create the bundle directory structure and write config.json
1376    pub fn create(&self) -> Result<()> {
1377        info!("Creating OCI bundle at {:?}", self.bundle_path);
1378
1379        // Create bundle directory
1380        fs::create_dir_all(&self.bundle_path).map_err(|e| {
1381            NucleusError::GVisorError(format!(
1382                "Failed to create bundle directory {:?}: {}",
1383                self.bundle_path, e
1384            ))
1385        })?;
1386        fs::set_permissions(&self.bundle_path, fs::Permissions::from_mode(0o700)).map_err(|e| {
1387            NucleusError::GVisorError(format!(
1388                "Failed to secure bundle directory permissions {:?}: {}",
1389                self.bundle_path, e
1390            ))
1391        })?;
1392
1393        // Create rootfs directory
1394        let rootfs = self.bundle_path.join("rootfs");
1395        fs::create_dir_all(&rootfs).map_err(|e| {
1396            NucleusError::GVisorError(format!("Failed to create rootfs directory: {}", e))
1397        })?;
1398        // The rootfs is the container's "/" – it must be traversable by the
1399        // container UID which may be non-root (via --user).  Mode 0755 matches
1400        // the standard Linux root directory permission and lets gVisor's VFS
1401        // permit path traversal for any UID.
1402        fs::set_permissions(&rootfs, fs::Permissions::from_mode(0o755)).map_err(|e| {
1403            NucleusError::GVisorError(format!(
1404                "Failed to set rootfs directory permissions {:?}: {}",
1405                rootfs, e
1406            ))
1407        })?;
1408
1409        // Write config.json
1410        let config_path = self.bundle_path.join("config.json");
1411        let config_json = serde_json::to_string_pretty(&self.config).map_err(|e| {
1412            NucleusError::GVisorError(format!("Failed to serialize OCI config: {}", e))
1413        })?;
1414
1415        // L5: Use O_NOFOLLOW via custom_flags to prevent writing through symlinks
1416        let mut file = OpenOptions::new()
1417            .create(true)
1418            .truncate(true)
1419            .write(true)
1420            .mode(0o600)
1421            .custom_flags(libc::O_NOFOLLOW)
1422            .open(&config_path)
1423            .map_err(|e| NucleusError::GVisorError(format!("Failed to open config.json: {}", e)))?;
1424        file.write_all(config_json.as_bytes()).map_err(|e| {
1425            NucleusError::GVisorError(format!("Failed to write config.json: {}", e))
1426        })?;
1427        file.sync_all()
1428            .map_err(|e| NucleusError::GVisorError(format!("Failed to sync config.json: {}", e)))?;
1429
1430        debug!("Created OCI bundle structure at {:?}", self.bundle_path);
1431
1432        Ok(())
1433    }
1434
1435    /// Get the rootfs path
1436    pub fn rootfs_path(&self) -> PathBuf {
1437        self.bundle_path.join("rootfs")
1438    }
1439
1440    /// Get the bundle path
1441    pub fn bundle_path(&self) -> &Path {
1442        &self.bundle_path
1443    }
1444
1445    /// Clean up the bundle
1446    pub fn cleanup(&self) -> Result<()> {
1447        if self.bundle_path.exists() {
1448            fs::remove_dir_all(&self.bundle_path).map_err(|e| {
1449                NucleusError::GVisorError(format!("Failed to cleanup bundle: {}", e))
1450            })?;
1451            debug!("Cleaned up OCI bundle at {:?}", self.bundle_path);
1452        }
1453        Ok(())
1454    }
1455}
1456
1457#[cfg(test)]
1458mod tests {
1459    use super::*;
1460    use tempfile::TempDir;
1461
1462    #[test]
1463    fn test_oci_config_new() {
1464        let config = OciConfig::new(vec!["/bin/sh".to_string()], Some("test".to_string()));
1465
1466        assert_eq!(config.oci_version, "1.0.2");
1467        assert_eq!(config.root.path, "rootfs");
1468        assert_eq!(config.process.args, vec!["/bin/sh"]);
1469        assert_eq!(config.hostname, Some("test".to_string()));
1470    }
1471
1472    #[test]
1473    fn test_oci_config_with_resources() {
1474        let limits = ResourceLimits::unlimited()
1475            .with_memory("512M")
1476            .unwrap()
1477            .with_cpu_cores(2.0)
1478            .unwrap();
1479
1480        let config = OciConfig::new(vec!["/bin/sh".to_string()], None).with_resources(&limits);
1481
1482        assert!(config.linux.is_some());
1483        let linux = config.linux.unwrap();
1484        assert!(linux.resources.is_some());
1485
1486        let resources = linux.resources.unwrap();
1487        assert!(resources.memory.is_some());
1488        assert!(resources.cpu.is_some());
1489    }
1490
1491    #[test]
1492    fn test_oci_bundle_create() {
1493        let temp_dir = TempDir::new().unwrap();
1494        let bundle_path = temp_dir.path().join("test-bundle");
1495
1496        let config = OciConfig::new(vec!["/bin/sh".to_string()], None);
1497        let bundle = OciBundle::new(bundle_path.clone(), config);
1498
1499        bundle.create().unwrap();
1500
1501        assert!(bundle_path.exists());
1502        assert!(bundle_path.join("rootfs").exists());
1503        assert!(bundle_path.join("config.json").exists());
1504
1505        bundle.cleanup().unwrap();
1506        assert!(!bundle_path.exists());
1507    }
1508
1509    #[test]
1510    fn test_oci_config_serialization() {
1511        let config = OciConfig::new(vec!["/bin/sh".to_string()], Some("test".to_string()));
1512
1513        let json = serde_json::to_string_pretty(&config).unwrap();
1514        assert!(json.contains("ociVersion"));
1515        assert!(json.contains("1.0.2"));
1516        assert!(json.contains("/bin/sh"));
1517
1518        // Test deserialization
1519        let deserialized: OciConfig = serde_json::from_str(&json).unwrap();
1520        assert_eq!(deserialized.oci_version, config.oci_version);
1521        assert_eq!(deserialized.process.args, config.process.args);
1522    }
1523
1524    #[test]
1525    fn test_host_runtime_binds_uses_fixed_paths_not_host_path() {
1526        // with_host_runtime_binds must NOT scan the host $PATH. Only standard
1527        // FHS paths should be bind-mounted to prevent leaking arbitrary host
1528        // directories into the container. Verify by setting a distinctive PATH
1529        // and checking that none of its entries appear in the resulting mounts.
1530        std::env::set_var("PATH", "/tmp/evil-inject-path/bin:/opt/attacker/sbin");
1531        let config = OciConfig::new(vec!["/bin/sh".to_string()], None).with_host_runtime_binds();
1532        let mount_dests: Vec<&str> = config
1533            .mounts
1534            .iter()
1535            .map(|m| m.destination.as_str())
1536            .collect();
1537        let mount_srcs: Vec<&str> = config.mounts.iter().map(|m| m.source.as_str()).collect();
1538        // Verify no mount references the injected PATH entries
1539        for path in &["/tmp/evil-inject-path", "/opt/attacker"] {
1540            assert!(
1541                !mount_dests.iter().any(|d| d.contains(path)),
1542                "with_host_runtime_binds must not use host $PATH – found {:?} in mount destinations",
1543                path
1544            );
1545            assert!(
1546                !mount_srcs.iter().any(|s| s.contains(path)),
1547                "with_host_runtime_binds must not use host $PATH – found {:?} in mount sources",
1548                path
1549            );
1550        }
1551        // Verify only standard FHS paths are mounted
1552        let allowed_prefixes = ["/bin", "/sbin", "/usr", "/lib", "/lib64", "/nix/store"];
1553        for mount in &config.mounts {
1554            if mount.mount_type == "bind" {
1555                assert!(
1556                    allowed_prefixes
1557                        .iter()
1558                        .any(|p| mount.destination.starts_with(p)),
1559                    "unexpected bind mount destination: {} – only FHS paths allowed",
1560                    mount.destination
1561                );
1562            }
1563        }
1564    }
1565
1566    #[test]
1567    fn test_volume_mounts_include_bind_and_tmpfs_options() {
1568        let tmp = tempfile::TempDir::new().unwrap();
1569        let config = OciConfig::new(vec!["/bin/sh".to_string()], None)
1570            .with_volume_mounts(&[
1571                crate::container::VolumeMount {
1572                    source: crate::container::VolumeSource::Bind {
1573                        source: tmp.path().to_path_buf(),
1574                    },
1575                    dest: std::path::PathBuf::from("/var/lib/app"),
1576                    read_only: true,
1577                },
1578                crate::container::VolumeMount {
1579                    source: crate::container::VolumeSource::Tmpfs {
1580                        size: Some("64M".to_string()),
1581                    },
1582                    dest: std::path::PathBuf::from("/var/cache/app"),
1583                    read_only: false,
1584                },
1585            ])
1586            .unwrap();
1587
1588        assert!(config.mounts.iter().any(|mount| {
1589            mount.destination == "/var/lib/app"
1590                && mount.mount_type == "bind"
1591                && mount.options.contains(&"ro".to_string())
1592        }));
1593        assert!(config.mounts.iter().any(|mount| {
1594            mount.destination == "/var/cache/app"
1595                && mount.mount_type == "tmpfs"
1596                && mount.options.contains(&"size=64M".to_string())
1597        }));
1598    }
1599
1600    #[test]
1601    fn test_volume_mounts_reject_sensitive_host_sources() {
1602        let err = OciConfig::new(vec!["/bin/sh".to_string()], None)
1603            .with_volume_mounts(&[crate::container::VolumeMount {
1604                source: crate::container::VolumeSource::Bind {
1605                    source: std::path::PathBuf::from("/proc/sys"),
1606                },
1607                dest: std::path::PathBuf::from("/host-proc"),
1608                read_only: true,
1609            }])
1610            .unwrap_err();
1611
1612        assert!(err.to_string().contains("sensitive host path"));
1613    }
1614
1615    #[test]
1616    fn test_oci_config_with_process_identity() {
1617        let config = OciConfig::new(vec!["/bin/sh".to_string()], None).with_process_identity(
1618            &crate::container::ProcessIdentity {
1619                uid: 1001,
1620                gid: 1002,
1621                additional_gids: vec![1003, 1004],
1622            },
1623        );
1624
1625        assert_eq!(config.process.user.uid, 1001);
1626        assert_eq!(config.process.user.gid, 1002);
1627        assert_eq!(config.process.user.additional_gids, Some(vec![1003, 1004]));
1628    }
1629
1630    #[test]
1631    fn test_oci_config_with_rlimits_uses_configured_memlock() {
1632        let limits = ResourceLimits::default()
1633            .with_pids(99)
1634            .unwrap()
1635            .with_memlock("8M")
1636            .unwrap();
1637
1638        let config = OciConfig::new(vec!["/bin/sh".to_string()], None).with_rlimits(&limits);
1639
1640        assert!(config.process.rlimits.iter().any(|limit| {
1641            limit.limit_type == "RLIMIT_NPROC" && limit.soft == 99 && limit.hard == 99
1642        }));
1643        assert!(config.process.rlimits.iter().any(|limit| {
1644            limit.limit_type == "RLIMIT_MEMLOCK"
1645                && limit.soft == 8 * 1024 * 1024
1646                && limit.hard == 8 * 1024 * 1024
1647        }));
1648    }
1649
1650    #[test]
1651    fn test_oci_config_with_rlimits_omits_nproc_when_unlimited() {
1652        let limits = ResourceLimits {
1653            pids_max: None,
1654            ..ResourceLimits::default()
1655        };
1656
1657        let config = OciConfig::new(vec!["/bin/sh".to_string()], None).with_rlimits(&limits);
1658
1659        assert!(
1660            !config
1661                .process
1662                .rlimits
1663                .iter()
1664                .any(|limit| limit.limit_type == "RLIMIT_NPROC"),
1665            "RLIMIT_NPROC must be omitted when pids_max is unlimited"
1666        );
1667    }
1668
1669    #[test]
1670    fn test_oci_config_uses_hardcoded_path_not_host() {
1671        // C-3: PATH must be a hardcoded minimal value, never the host's PATH.
1672        // This prevents leaking host filesystem layout into the container.
1673        std::env::set_var("PATH", "/nix/store/secret-hash/bin:/home/user/.local/bin");
1674        let config = OciConfig::new(vec!["/bin/sh".to_string()], None);
1675        let path_env = config
1676            .process
1677            .env
1678            .iter()
1679            .find(|e| e.starts_with("PATH="))
1680            .expect("PATH env must be set");
1681        assert_eq!(
1682            path_env, "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1683            "OCI config must not leak host PATH"
1684        );
1685        assert!(
1686            !path_env.contains("/nix/store/secret"),
1687            "Host PATH must not leak into container"
1688        );
1689    }
1690
1691    #[test]
1692    fn test_oci_hooks_serialization_roundtrip() {
1693        let hooks = OciHooks {
1694            create_runtime: vec![OciHook {
1695                path: "/usr/bin/hook1".to_string(),
1696                args: vec!["hook1".to_string(), "--arg1".to_string()],
1697                env: vec!["FOO=bar".to_string()],
1698                timeout: Some(10),
1699            }],
1700            create_container: vec![],
1701            start_container: vec![],
1702            poststart: vec![OciHook {
1703                path: "/usr/bin/hook2".to_string(),
1704                args: vec![],
1705                env: vec![],
1706                timeout: None,
1707            }],
1708            poststop: vec![],
1709        };
1710
1711        let json = serde_json::to_string_pretty(&hooks).unwrap();
1712        assert!(json.contains("createRuntime"));
1713        assert!(json.contains("/usr/bin/hook1"));
1714        assert!(!json.contains("createContainer")); // empty vecs are skipped
1715
1716        let deserialized: OciHooks = serde_json::from_str(&json).unwrap();
1717        assert_eq!(deserialized.create_runtime.len(), 1);
1718        assert_eq!(deserialized.create_runtime[0].path, "/usr/bin/hook1");
1719        assert_eq!(deserialized.create_runtime[0].timeout, Some(10));
1720        assert_eq!(deserialized.poststart.len(), 1);
1721        assert!(deserialized.create_container.is_empty());
1722    }
1723
1724    #[test]
1725    fn test_oci_hooks_is_empty() {
1726        let empty = OciHooks::default();
1727        assert!(empty.is_empty());
1728
1729        let not_empty = OciHooks {
1730            poststop: vec![OciHook {
1731                path: "/bin/cleanup".to_string(),
1732                args: vec![],
1733                env: vec![],
1734                timeout: None,
1735            }],
1736            ..Default::default()
1737        };
1738        assert!(!not_empty.is_empty());
1739    }
1740
1741    #[test]
1742    fn test_oci_config_with_hooks() {
1743        let hooks = OciHooks {
1744            create_runtime: vec![OciHook {
1745                path: "/usr/bin/setup".to_string(),
1746                args: vec![],
1747                env: vec![],
1748                timeout: None,
1749            }],
1750            ..Default::default()
1751        };
1752
1753        let config = OciConfig::new(vec!["/bin/sh".to_string()], None).with_hooks(hooks);
1754        assert!(config.hooks.is_some());
1755
1756        let json = serde_json::to_string_pretty(&config).unwrap();
1757        assert!(json.contains("hooks"));
1758        assert!(json.contains("createRuntime"));
1759
1760        let deserialized: OciConfig = serde_json::from_str(&json).unwrap();
1761        assert!(deserialized.hooks.is_some());
1762        assert_eq!(deserialized.hooks.unwrap().create_runtime.len(), 1);
1763    }
1764
1765    #[test]
1766    fn test_oci_config_with_empty_hooks_serializes_without_hooks() {
1767        let config =
1768            OciConfig::new(vec!["/bin/sh".to_string()], None).with_hooks(OciHooks::default());
1769        assert!(config.hooks.is_none()); // empty hooks are set to None
1770
1771        let json = serde_json::to_string_pretty(&config).unwrap();
1772        assert!(!json.contains("hooks"));
1773    }
1774
1775    #[test]
1776    fn test_oci_hook_rejects_relative_path() {
1777        let hook = OciHook {
1778            path: "relative/path".to_string(),
1779            args: vec![],
1780            env: vec![],
1781            timeout: None,
1782        };
1783        let state = OciContainerState {
1784            oci_version: "1.0.2".to_string(),
1785            id: "test".to_string(),
1786            status: OciStatus::Creating,
1787            pid: 1234,
1788            bundle: "/tmp/bundle".to_string(),
1789        };
1790        let result = OciHooks::run_hooks(&[hook], &state, "test");
1791        assert!(result.is_err());
1792        let err_msg = result.unwrap_err().to_string();
1793        assert!(err_msg.contains("absolute"), "error: {}", err_msg);
1794    }
1795
1796    /// Read the original PATH from /proc/self/environ.
1797    ///
1798    /// Other tests in this module call `std::env::set_var("PATH", ...)` which
1799    /// corrupts the process environment. /proc/self/environ is frozen at
1800    /// process startup so it always reflects the real PATH.
1801    fn original_path() -> String {
1802        if let Ok(environ) = std::fs::read("/proc/self/environ") {
1803            for entry in environ.split(|&b| b == 0) {
1804                if let Ok(s) = std::str::from_utf8(entry) {
1805                    if let Some(val) = s.strip_prefix("PATH=") {
1806                        return val.to_string();
1807                    }
1808                }
1809            }
1810        }
1811        String::new()
1812    }
1813
1814    /// Resolve the absolute path to bash for test scripts.
1815    fn find_bash() -> String {
1816        let candidates = ["/bin/bash", "/usr/bin/bash"];
1817        for c in &candidates {
1818            if std::path::Path::new(c).exists() {
1819                return c.to_string();
1820            }
1821        }
1822        for dir in original_path().split(':') {
1823            let candidate = std::path::PathBuf::from(dir).join("bash");
1824            if candidate.exists() {
1825                return candidate.to_string_lossy().to_string();
1826            }
1827        }
1828        panic!("Cannot find bash binary for test");
1829    }
1830
1831    /// Write a script file with proper shebang and ensure it's fully flushed before execution.
1832    /// Embeds the original PATH so scripts can find utilities like `cat`/`touch`
1833    /// even when other tests have corrupted the process PATH.
1834    fn write_script(path: &std::path::Path, body: &str) {
1835        use std::io::Write as IoWrite;
1836        let bash = find_bash();
1837        let orig_path = original_path();
1838        let content = format!("#!{}\nexport PATH='{}'\n{}", bash, orig_path, body);
1839        let mut f = OpenOptions::new()
1840            .create(true)
1841            .truncate(true)
1842            .write(true)
1843            .mode(0o755)
1844            .open(path)
1845            .unwrap();
1846        f.write_all(content.as_bytes()).unwrap();
1847        f.sync_all().unwrap();
1848        drop(f);
1849    }
1850
1851    #[test]
1852    fn test_oci_hook_executes_successfully() {
1853        let temp_dir = TempDir::new().unwrap();
1854        let hook_script = temp_dir.path().join("hook.sh");
1855        let output_file = temp_dir.path().join("output.json");
1856
1857        write_script(
1858            &hook_script,
1859            &format!("cat > {}\n", output_file.to_string_lossy()),
1860        );
1861
1862        let hook = OciHook {
1863            path: hook_script.to_string_lossy().to_string(),
1864            args: vec![],
1865            env: vec![],
1866            timeout: Some(5),
1867        };
1868        let state = OciContainerState {
1869            oci_version: "1.0.2".to_string(),
1870            id: "test-container".to_string(),
1871            status: OciStatus::Creating,
1872            pid: 12345,
1873            bundle: "/tmp/test-bundle".to_string(),
1874        };
1875
1876        OciHooks::run_hooks(&[hook], &state, "createRuntime").unwrap();
1877
1878        // Verify the hook received the container state JSON on stdin
1879        let written = std::fs::read_to_string(&output_file).unwrap();
1880        let parsed: serde_json::Value = serde_json::from_str(&written).unwrap();
1881        assert_eq!(parsed["id"], "test-container");
1882        assert_eq!(parsed["pid"], 12345);
1883        assert_eq!(parsed["status"], "creating");
1884    }
1885
1886    #[test]
1887    fn test_oci_hook_retries_text_file_busy_spawn() {
1888        let temp_dir = TempDir::new().unwrap();
1889        let hook_script = temp_dir.path().join("hook.sh");
1890        let output_file = temp_dir.path().join("output.json");
1891
1892        write_script(
1893            &hook_script,
1894            &format!("cat > {}\n", output_file.to_string_lossy()),
1895        );
1896
1897        let (ready_tx, ready_rx) = std::sync::mpsc::channel();
1898        let busy_script = hook_script.clone();
1899        let busy_handle = std::thread::spawn(move || {
1900            let _busy_file = OpenOptions::new().write(true).open(&busy_script).unwrap();
1901            ready_tx.send(()).unwrap();
1902            std::thread::sleep(std::time::Duration::from_millis(100));
1903        });
1904        ready_rx.recv().unwrap();
1905
1906        let hook = OciHook {
1907            path: hook_script.to_string_lossy().to_string(),
1908            args: vec![],
1909            env: vec![],
1910            timeout: Some(5),
1911        };
1912        let state = OciContainerState {
1913            oci_version: "1.0.2".to_string(),
1914            id: "test-container".to_string(),
1915            status: OciStatus::Creating,
1916            pid: 12345,
1917            bundle: "/tmp/test-bundle".to_string(),
1918        };
1919
1920        let result = OciHooks::run_hooks(&[hook], &state, "createRuntime");
1921        busy_handle.join().unwrap();
1922        result.unwrap();
1923
1924        let written = std::fs::read_to_string(&output_file).unwrap();
1925        let parsed: serde_json::Value = serde_json::from_str(&written).unwrap();
1926        assert_eq!(parsed["id"], "test-container");
1927    }
1928
1929    #[test]
1930    fn test_oci_hook_nonzero_exit_is_error() {
1931        let temp_dir = TempDir::new().unwrap();
1932        let hook_script = temp_dir.path().join("fail.sh");
1933        write_script(&hook_script, "exit 1\n");
1934
1935        let hook = OciHook {
1936            path: hook_script.to_string_lossy().to_string(),
1937            args: vec![],
1938            env: vec![],
1939            timeout: Some(5),
1940        };
1941        let state = OciContainerState {
1942            oci_version: "1.0.2".to_string(),
1943            id: "test".to_string(),
1944            status: OciStatus::Creating,
1945            pid: 1,
1946            bundle: "".to_string(),
1947        };
1948
1949        let result = OciHooks::run_hooks(&[hook], &state, "test");
1950        assert!(result.is_err());
1951        assert!(result
1952            .unwrap_err()
1953            .to_string()
1954            .contains("exited with status"));
1955    }
1956
1957    #[test]
1958    fn test_oci_hooks_best_effort_continues_on_failure() {
1959        let temp_dir = TempDir::new().unwrap();
1960        let fail_script = temp_dir.path().join("fail.sh");
1961        write_script(&fail_script, "exit 1\n");
1962
1963        let marker = temp_dir.path().join("ran");
1964        let ok_script = temp_dir.path().join("ok.sh");
1965        write_script(&ok_script, &format!("touch {}\n", marker.to_string_lossy()));
1966
1967        let hooks = vec![
1968            OciHook {
1969                path: fail_script.to_string_lossy().to_string(),
1970                args: vec![],
1971                env: vec![],
1972                timeout: Some(5),
1973            },
1974            OciHook {
1975                path: ok_script.to_string_lossy().to_string(),
1976                args: vec![],
1977                env: vec![],
1978                timeout: Some(5),
1979            },
1980        ];
1981        let state = OciContainerState {
1982            oci_version: "1.0.2".to_string(),
1983            id: "test".to_string(),
1984            status: OciStatus::Stopped,
1985            pid: 0,
1986            bundle: "".to_string(),
1987        };
1988
1989        // best_effort should not panic or return error
1990        OciHooks::run_hooks_best_effort(&hooks, &state, "poststop");
1991        // Second hook should have run despite first failing
1992        assert!(marker.exists(), "second hook should run after first fails");
1993    }
1994}