Skip to main content

nucleus/oci/
mod.rs

1use crate::container::OciStatus;
2use crate::error::{NucleusError, Result};
3use crate::filesystem::normalize_container_destination;
4use crate::isolation::{IdMapping, NamespaceConfig, UserNamespaceConfig};
5use crate::resources::ResourceLimits;
6use serde::{Deserialize, Serialize};
7use std::collections::{BTreeSet, HashMap};
8use std::fs;
9use std::fs::OpenOptions;
10use std::io::Write;
11use std::os::unix::fs::{OpenOptionsExt, PermissionsExt};
12use std::path::{Path, PathBuf};
13use tracing::{debug, info, warn};
14
15/// OCI Runtime Specification configuration
16///
17/// This implements a subset of the OCI runtime spec for gVisor compatibility
18/// Spec: <https://github.com/opencontainers/runtime-spec/blob/main/config.md>
19#[derive(Debug, Clone, Serialize, Deserialize)]
20pub struct OciConfig {
21    #[serde(rename = "ociVersion")]
22    pub oci_version: String,
23
24    pub root: OciRoot,
25    pub process: OciProcess,
26    pub hostname: Option<String>,
27    pub mounts: Vec<OciMount>,
28    pub linux: Option<OciLinux>,
29    #[serde(default, skip_serializing_if = "Option::is_none")]
30    pub hooks: Option<OciHooks>,
31    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
32    pub annotations: HashMap<String, String>,
33}
34
35#[derive(Debug, Clone, Serialize, Deserialize)]
36pub struct OciRoot {
37    pub path: String,
38    pub readonly: bool,
39}
40
41#[derive(Debug, Clone, Serialize, Deserialize)]
42pub struct OciProcess {
43    pub terminal: bool,
44    pub user: OciUser,
45    pub args: Vec<String>,
46    pub env: Vec<String>,
47    pub cwd: String,
48    #[serde(rename = "noNewPrivileges")]
49    pub no_new_privileges: bool,
50    pub capabilities: Option<OciCapabilities>,
51    #[serde(default, skip_serializing_if = "Vec::is_empty")]
52    pub rlimits: Vec<OciRlimit>,
53    #[serde(
54        rename = "consoleSize",
55        default,
56        skip_serializing_if = "Option::is_none"
57    )]
58    pub console_size: Option<OciConsoleSize>,
59    #[serde(
60        rename = "apparmorProfile",
61        default,
62        skip_serializing_if = "Option::is_none"
63    )]
64    pub apparmor_profile: Option<String>,
65    #[serde(
66        rename = "selinuxLabel",
67        default,
68        skip_serializing_if = "Option::is_none"
69    )]
70    pub selinux_label: Option<String>,
71}
72
73#[derive(Debug, Clone, Serialize, Deserialize)]
74pub struct OciUser {
75    pub uid: u32,
76    pub gid: u32,
77    #[serde(skip_serializing_if = "Option::is_none")]
78    pub additional_gids: Option<Vec<u32>>,
79}
80
81#[derive(Debug, Clone, Serialize, Deserialize)]
82pub struct OciCapabilities {
83    pub bounding: Vec<String>,
84    pub effective: Vec<String>,
85    pub inheritable: Vec<String>,
86    pub permitted: Vec<String>,
87    pub ambient: Vec<String>,
88}
89
90#[derive(Debug, Clone, Serialize, Deserialize)]
91pub struct OciMount {
92    pub destination: String,
93    pub source: String,
94    #[serde(rename = "type")]
95    pub mount_type: String,
96    pub options: Vec<String>,
97}
98
99#[derive(Debug, Clone, Serialize, Deserialize)]
100pub struct OciLinux {
101    #[serde(skip_serializing_if = "Option::is_none")]
102    pub namespaces: Option<Vec<OciNamespace>>,
103    #[serde(skip_serializing_if = "Option::is_none")]
104    pub resources: Option<OciResources>,
105    #[serde(rename = "uidMappings", skip_serializing_if = "Vec::is_empty", default)]
106    pub uid_mappings: Vec<OciIdMapping>,
107    #[serde(rename = "gidMappings", skip_serializing_if = "Vec::is_empty", default)]
108    pub gid_mappings: Vec<OciIdMapping>,
109    #[serde(rename = "maskedPaths", skip_serializing_if = "Vec::is_empty", default)]
110    pub masked_paths: Vec<String>,
111    #[serde(
112        rename = "readonlyPaths",
113        skip_serializing_if = "Vec::is_empty",
114        default
115    )]
116    pub readonly_paths: Vec<String>,
117    #[serde(default, skip_serializing_if = "Vec::is_empty")]
118    pub devices: Vec<OciDevice>,
119    #[serde(default, skip_serializing_if = "Option::is_none")]
120    pub seccomp: Option<OciSeccomp>,
121    #[serde(
122        rename = "rootfsPropagation",
123        default,
124        skip_serializing_if = "Option::is_none"
125    )]
126    pub rootfs_propagation: Option<String>,
127    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
128    pub sysctl: HashMap<String, String>,
129    #[serde(
130        rename = "cgroupsPath",
131        default,
132        skip_serializing_if = "Option::is_none"
133    )]
134    pub cgroups_path: Option<String>,
135    #[serde(rename = "intelRdt", default, skip_serializing_if = "Option::is_none")]
136    pub intel_rdt: Option<OciIntelRdt>,
137}
138
139#[derive(Debug, Clone, Serialize, Deserialize)]
140pub struct OciNamespace {
141    #[serde(rename = "type")]
142    pub namespace_type: String,
143}
144
145#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
146pub struct OciIdMapping {
147    #[serde(rename = "containerID")]
148    pub container_id: u32,
149    #[serde(rename = "hostID")]
150    pub host_id: u32,
151    pub size: u32,
152}
153
154#[derive(Debug, Clone, Serialize, Deserialize)]
155pub struct OciResources {
156    #[serde(skip_serializing_if = "Option::is_none")]
157    pub memory: Option<OciMemory>,
158    #[serde(skip_serializing_if = "Option::is_none")]
159    pub cpu: Option<OciCpu>,
160    #[serde(skip_serializing_if = "Option::is_none")]
161    pub pids: Option<OciPids>,
162}
163
164#[derive(Debug, Clone, Serialize, Deserialize)]
165pub struct OciMemory {
166    #[serde(skip_serializing_if = "Option::is_none")]
167    pub limit: Option<i64>,
168}
169
170#[derive(Debug, Clone, Serialize, Deserialize)]
171pub struct OciCpu {
172    #[serde(skip_serializing_if = "Option::is_none")]
173    pub quota: Option<i64>,
174    #[serde(skip_serializing_if = "Option::is_none")]
175    pub period: Option<u64>,
176}
177
178#[derive(Debug, Clone, Serialize, Deserialize)]
179pub struct OciPids {
180    pub limit: i64,
181}
182
183/// OCI process resource limit.
184///
185/// Spec: <https://github.com/opencontainers/runtime-spec/blob/main/config.md#posix-process>
186#[derive(Debug, Clone, Serialize, Deserialize)]
187pub struct OciRlimit {
188    /// Resource type (e.g. "RLIMIT_NOFILE", "RLIMIT_NPROC")
189    #[serde(rename = "type")]
190    pub limit_type: String,
191    /// Hard limit
192    pub hard: u64,
193    /// Soft limit
194    pub soft: u64,
195}
196
197/// OCI console size for terminal-attached processes.
198#[derive(Debug, Clone, Serialize, Deserialize)]
199pub struct OciConsoleSize {
200    pub height: u32,
201    pub width: u32,
202}
203
204/// OCI linux device entry.
205///
206/// Spec: <https://github.com/opencontainers/runtime-spec/blob/main/config-linux.md#devices>
207#[derive(Debug, Clone, Serialize, Deserialize)]
208pub struct OciDevice {
209    /// Device type: "c" (char), "b" (block), "u" (unbuffered), "p" (FIFO)
210    #[serde(rename = "type")]
211    pub device_type: String,
212    /// Device path inside the container
213    pub path: String,
214    /// Major number
215    #[serde(skip_serializing_if = "Option::is_none")]
216    pub major: Option<i64>,
217    /// Minor number
218    #[serde(skip_serializing_if = "Option::is_none")]
219    pub minor: Option<i64>,
220    /// File mode (permissions)
221    #[serde(rename = "fileMode", skip_serializing_if = "Option::is_none")]
222    pub file_mode: Option<u32>,
223    /// UID of the device owner
224    #[serde(skip_serializing_if = "Option::is_none")]
225    pub uid: Option<u32>,
226    /// GID of the device owner
227    #[serde(skip_serializing_if = "Option::is_none")]
228    pub gid: Option<u32>,
229}
230
231/// OCI seccomp configuration.
232///
233/// Spec: <https://github.com/opencontainers/runtime-spec/blob/main/config-linux.md#seccomp>
234#[derive(Debug, Clone, Serialize, Deserialize)]
235pub struct OciSeccomp {
236    /// Default action when no rule matches (e.g. "SCMP_ACT_ERRNO", "SCMP_ACT_ALLOW")
237    #[serde(rename = "defaultAction")]
238    pub default_action: String,
239    /// Target architectures
240    #[serde(default, skip_serializing_if = "Vec::is_empty")]
241    pub architectures: Vec<String>,
242    /// Syscall rules
243    #[serde(default, skip_serializing_if = "Vec::is_empty")]
244    pub syscalls: Vec<OciSeccompSyscall>,
245}
246
247/// A single seccomp syscall rule.
248#[derive(Debug, Clone, Serialize, Deserialize)]
249pub struct OciSeccompSyscall {
250    /// Syscall names this rule applies to
251    pub names: Vec<String>,
252    /// Action to take (e.g. "SCMP_ACT_ALLOW")
253    pub action: String,
254    /// Optional argument conditions
255    #[serde(default, skip_serializing_if = "Vec::is_empty")]
256    pub args: Vec<OciSeccompArg>,
257}
258
259/// Seccomp syscall argument filter.
260#[derive(Debug, Clone, Serialize, Deserialize)]
261pub struct OciSeccompArg {
262    /// Argument index (0-based)
263    pub index: u32,
264    /// Value to compare against
265    pub value: u64,
266    /// Second value for masked operations
267    #[serde(rename = "valueTwo", default, skip_serializing_if = "is_zero")]
268    pub value_two: u64,
269    /// Comparison operator (e.g. "SCMP_CMP_EQ", "SCMP_CMP_MASKED_EQ")
270    pub op: String,
271}
272
/// Serde `skip_serializing_if` predicate: true exactly when the value is zero.
fn is_zero(v: &u64) -> bool {
    matches!(*v, 0)
}
276
277/// OCI Intel RDT (Resource Director Technology) configuration.
278///
279/// Spec: <https://github.com/opencontainers/runtime-spec/blob/main/config-linux.md#intel-rdt>
280#[derive(Debug, Clone, Serialize, Deserialize)]
281pub struct OciIntelRdt {
282    /// Unique identity for the container's cache and memory bandwidth allocation
283    #[serde(rename = "closID", default, skip_serializing_if = "Option::is_none")]
284    pub clos_id: Option<String>,
285    /// Schema for L3 cache allocation
286    #[serde(
287        rename = "l3CacheSchema",
288        default,
289        skip_serializing_if = "Option::is_none"
290    )]
291    pub l3_cache_schema: Option<String>,
292    /// Schema for memory bandwidth allocation
293    #[serde(
294        rename = "memBwSchema",
295        default,
296        skip_serializing_if = "Option::is_none"
297    )]
298    pub mem_bw_schema: Option<String>,
299}
300
301/// A single OCI lifecycle hook entry.
302///
303/// Spec: <https://github.com/opencontainers/runtime-spec/blob/main/config.md#posix-platform-hooks>
304#[derive(Debug, Clone, Serialize, Deserialize)]
305pub struct OciHook {
306    /// Absolute path to the hook binary.
307    pub path: String,
308    /// Arguments passed to the hook (argv\[0\] should be the binary name).
309    #[serde(default, skip_serializing_if = "Vec::is_empty")]
310    pub args: Vec<String>,
311    /// Environment variables for the hook process.
312    #[serde(default, skip_serializing_if = "Vec::is_empty")]
313    pub env: Vec<String>,
314    /// Timeout in seconds. If the hook does not exit within this duration it is killed.
315    #[serde(default, skip_serializing_if = "Option::is_none")]
316    pub timeout: Option<u32>,
317}
318
319/// OCI lifecycle hooks.
320///
321/// Spec: <https://github.com/opencontainers/runtime-spec/blob/main/config.md#posix-platform-hooks>
322#[derive(Debug, Clone, Default, Serialize, Deserialize)]
323pub struct OciHooks {
324    /// Called after the runtime environment has been created but before pivot_root.
325    #[serde(
326        rename = "createRuntime",
327        default,
328        skip_serializing_if = "Vec::is_empty"
329    )]
330    pub create_runtime: Vec<OciHook>,
331    /// Called after pivot_root but before the start operation.
332    #[serde(
333        rename = "createContainer",
334        default,
335        skip_serializing_if = "Vec::is_empty"
336    )]
337    pub create_container: Vec<OciHook>,
338    /// Called after the start operation but before the user process executes.
339    #[serde(
340        rename = "startContainer",
341        default,
342        skip_serializing_if = "Vec::is_empty"
343    )]
344    pub start_container: Vec<OciHook>,
345    /// Called after the user-specified process has started.
346    #[serde(default, skip_serializing_if = "Vec::is_empty")]
347    pub poststart: Vec<OciHook>,
348    /// Called after the container has been stopped.
349    #[serde(default, skip_serializing_if = "Vec::is_empty")]
350    pub poststop: Vec<OciHook>,
351}
352
353/// Container state JSON passed to OCI hooks on stdin.
354///
355/// Spec: <https://github.com/opencontainers/runtime-spec/blob/main/runtime.md#state>
356#[derive(Debug, Clone, Serialize)]
357pub struct OciContainerState {
358    #[serde(rename = "ociVersion")]
359    pub oci_version: String,
360    pub id: String,
361    pub status: OciStatus,
362    pub pid: u32,
363    pub bundle: String,
364}
365
366impl OciHooks {
367    /// Returns true if there are no hooks configured.
368    pub fn is_empty(&self) -> bool {
369        self.create_runtime.is_empty()
370            && self.create_container.is_empty()
371            && self.start_container.is_empty()
372            && self.poststart.is_empty()
373            && self.poststop.is_empty()
374    }
375
376    /// Execute a list of hooks in order, passing container state JSON on stdin.
377    ///
378    /// If any hook exits non-zero, an error is returned immediately (remaining hooks are skipped).
379    pub fn run_hooks(hooks: &[OciHook], state: &OciContainerState, phase: &str) -> Result<()> {
380        let state_json = serde_json::to_string(state).map_err(|e| {
381            NucleusError::HookError(format!(
382                "Failed to serialize container state for hook: {}",
383                e
384            ))
385        })?;
386
387        for (i, hook) in hooks.iter().enumerate() {
388            info!(
389                "Running {} hook [{}/{}]: {}",
390                phase,
391                i + 1,
392                hooks.len(),
393                hook.path
394            );
395            Self::execute_hook(hook, &state_json, phase)?;
396        }
397
398        Ok(())
399    }
400
401    /// Execute a list of hooks best-effort (log errors but don't fail).
402    ///
403    /// Used for poststop hooks per the OCI spec: errors MUST be logged but MUST NOT
404    /// prevent cleanup.
405    pub fn run_hooks_best_effort(hooks: &[OciHook], state: &OciContainerState, phase: &str) {
406        let state_json = match serde_json::to_string(state) {
407            Ok(json) => json,
408            Err(e) => {
409                warn!(
410                    "Failed to serialize container state for {} hooks: {}",
411                    phase, e
412                );
413                return;
414            }
415        };
416
417        for (i, hook) in hooks.iter().enumerate() {
418            info!(
419                "Running {} hook [{}/{}]: {}",
420                phase,
421                i + 1,
422                hooks.len(),
423                hook.path
424            );
425            if let Err(e) = Self::execute_hook(hook, &state_json, phase) {
426                warn!("{} hook [{}] failed (continuing): {}", phase, i + 1, e);
427            }
428        }
429    }
430
431    fn execute_hook(hook: &OciHook, state_json: &str, phase: &str) -> Result<()> {
432        #[cfg(not(test))]
433        use std::os::unix::process::CommandExt;
434        use std::process::{Command, Stdio};
435
436        let hook_path = Path::new(&hook.path);
437        if !hook_path.is_absolute() {
438            return Err(NucleusError::HookError(format!(
439                "{} hook path must be absolute: {}",
440                phase, hook.path
441            )));
442        }
443        if !hook_path.exists() {
444            return Err(NucleusError::HookError(format!(
445                "{} hook binary not found: {}",
446                phase, hook.path
447            )));
448        }
449
450        // C-1: Validate hook binary ownership and permissions to prevent
451        // execution of world-writable or unexpectedly-owned binaries.
452        // Similar to runsc's hook validation — reject hooks that could be
453        // tampered with by unprivileged users.
454        Self::validate_hook_binary(hook_path, phase)?;
455
456        let mut cmd = Command::new(&hook.path);
457        if !hook.args.is_empty() {
458            // OCI spec: args[0] is the binary name (like execve argv); pass rest as arguments
459            cmd.args(&hook.args[1..]);
460        }
461
462        if !hook.env.is_empty() {
463            cmd.env_clear();
464            for entry in &hook.env {
465                if let Some((key, value)) = entry.split_once('=') {
466                    cmd.env(key, value);
467                }
468            }
469        }
470
471        // C-1: Drop all capabilities and set restrictive resource limits
472        // for hook execution. Hooks run in the parent process before security
473        // hardening, so we sandbox them defensively.
474        cmd.stdin(Stdio::piped());
475        cmd.stdout(Stdio::piped());
476        cmd.stderr(Stdio::piped());
477
478        // C-1: Apply RLIMIT backstops only in the spawned child process
479        // via pre_exec, so the parent process is not affected.
480        // Note: pre_exec runs after fork but before exec, in the child process.
481        #[cfg(not(test))]
482        unsafe {
483            cmd.pre_exec(|| {
484                let rlim_nproc = libc::rlimit {
485                    rlim_cur: 1024,
486                    rlim_max: 1024,
487                };
488                libc::setrlimit(libc::RLIMIT_NPROC, &rlim_nproc);
489
490                let rlim_nofile = libc::rlimit {
491                    rlim_cur: 1024,
492                    rlim_max: 1024,
493                };
494                libc::setrlimit(libc::RLIMIT_NOFILE, &rlim_nofile);
495
496                Ok(())
497            });
498        }
499
500        let mut child = cmd.spawn().map_err(|e| {
501            NucleusError::HookError(format!(
502                "Failed to spawn {} hook {}: {}",
503                phase, hook.path, e
504            ))
505        })?;
506
507        if let Some(mut stdin) = child.stdin.take() {
508            use std::io::Write as IoWrite;
509            let _ = stdin.write_all(state_json.as_bytes());
510        }
511
512        let timeout_secs = hook.timeout.unwrap_or(30) as u64;
513        let start = std::time::Instant::now();
514        let timeout = std::time::Duration::from_secs(timeout_secs);
515
516        loop {
517            match child.try_wait() {
518                Ok(Some(status)) => {
519                    if status.success() {
520                        debug!("{} hook {} completed successfully", phase, hook.path);
521                        return Ok(());
522                    } else {
523                        let stderr = child
524                            .stderr
525                            .take()
526                            .map(|mut e| {
527                                let mut buf = String::new();
528                                use std::io::Read;
529                                let _ = e.read_to_string(&mut buf);
530                                buf
531                            })
532                            .unwrap_or_default();
533                        return Err(NucleusError::HookError(format!(
534                            "{} hook {} exited with status: {}{}",
535                            phase,
536                            hook.path,
537                            status,
538                            if stderr.is_empty() {
539                                String::new()
540                            } else {
541                                format!(" (stderr: {})", stderr.trim())
542                            }
543                        )));
544                    }
545                }
546                Ok(None) => {
547                    if start.elapsed() >= timeout {
548                        let _ = child.kill();
549                        let _ = child.wait();
550                        return Err(NucleusError::HookError(format!(
551                            "{} hook {} timed out after {}s",
552                            phase, hook.path, timeout_secs
553                        )));
554                    }
555                    std::thread::sleep(std::time::Duration::from_millis(50));
556                }
557                Err(e) => {
558                    return Err(NucleusError::HookError(format!(
559                        "Failed to wait for {} hook {}: {}",
560                        phase, hook.path, e
561                    )));
562                }
563            }
564        }
565    }
566
567    /// Validate hook binary ownership and permissions.
568    ///
569    /// Rejects hooks that are world-writable or group-writable, or owned by
570    /// a UID that doesn't match the effective UID or root. This prevents
571    /// privilege escalation via tampered hook binaries.
572    fn validate_hook_binary(hook_path: &Path, phase: &str) -> Result<()> {
573        let metadata = std::fs::metadata(hook_path).map_err(|e| {
574            NucleusError::HookError(format!(
575                "Failed to stat {} hook {}: {}",
576                phase,
577                hook_path.display(),
578                e
579            ))
580        })?;
581
582        use std::os::unix::fs::MetadataExt;
583        let mode = metadata.mode();
584        let uid = metadata.uid();
585        let gid = metadata.gid();
586        let effective_uid = nix::unistd::Uid::effective().as_raw();
587
588        // Reject world-writable hooks
589        if mode & 0o002 != 0 {
590            return Err(NucleusError::HookError(format!(
591                "{} hook {} is world-writable (mode {:04o}) — refusing to execute",
592                phase,
593                hook_path.display(),
594                mode & 0o7777
595            )));
596        }
597
598        // Reject group-writable hooks unless owned by root
599        if mode & 0o020 != 0 && uid != 0 {
600            return Err(NucleusError::HookError(format!(
601                "{} hook {} is group-writable and not owned by root (mode {:04o}, uid {}) — refusing to execute",
602                phase,
603                hook_path.display(),
604                mode & 0o7777,
605                uid
606            )));
607        }
608
609        // Reject hooks owned by arbitrary UIDs — must be root or effective UID
610        if uid != 0 && uid != effective_uid {
611            return Err(NucleusError::HookError(format!(
612                "{} hook {} is owned by UID {} (expected 0 or {}) — refusing to execute",
613                phase,
614                hook_path.display(),
615                uid,
616                effective_uid
617            )));
618        }
619
620        // Reject hooks with setuid/setgid bits
621        if mode & 0o6000 != 0 {
622            return Err(NucleusError::HookError(format!(
623                "{} hook {} has setuid/setgid bits (mode {:04o}) — refusing to execute",
624                phase,
625                hook_path.display(),
626                mode & 0o7777
627            )));
628        }
629
630        debug!(
631            "{} hook {} validation passed (uid={}, gid={}, mode={:04o})",
632            phase,
633            hook_path.display(),
634            uid,
635            gid,
636            mode & 0o7777
637        );
638
639        Ok(())
640    }
641}
642
643impl OciConfig {
644    /// Create a minimal OCI config for Nucleus containers
645    pub fn new(command: Vec<String>, hostname: Option<String>) -> Self {
646        Self {
647            oci_version: "1.0.2".to_string(),
648            root: OciRoot {
649                path: "rootfs".to_string(),
650                readonly: true,
651            },
652            process: OciProcess {
653                terminal: false,
654                user: OciUser {
655                    uid: 0,
656                    gid: 0,
657                    additional_gids: None,
658                },
659                args: command,
660                env: vec![
661                    "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin".to_string(),
662                ],
663                cwd: "/".to_string(),
664                no_new_privileges: true,
665                capabilities: Some(OciCapabilities {
666                    bounding: vec![],
667                    effective: vec![],
668                    inheritable: vec![],
669                    permitted: vec![],
670                    ambient: vec![],
671                }),
672                rlimits: vec![],
673                console_size: None,
674                apparmor_profile: None,
675                selinux_label: None,
676            },
677            hostname,
678            mounts: vec![
679                OciMount {
680                    destination: "/proc".to_string(),
681                    source: "proc".to_string(),
682                    mount_type: "proc".to_string(),
683                    options: vec![
684                        "nosuid".to_string(),
685                        "noexec".to_string(),
686                        "nodev".to_string(),
687                    ],
688                },
689                OciMount {
690                    destination: "/dev".to_string(),
691                    source: "tmpfs".to_string(),
692                    mount_type: "tmpfs".to_string(),
693                    options: vec![
694                        "nosuid".to_string(),
695                        "noexec".to_string(),
696                        "strictatime".to_string(),
697                        "mode=755".to_string(),
698                        "size=65536k".to_string(),
699                    ],
700                },
701                OciMount {
702                    destination: "/tmp".to_string(),
703                    source: "tmpfs".to_string(),
704                    mount_type: "tmpfs".to_string(),
705                    options: vec![
706                        "nosuid".to_string(),
707                        "nodev".to_string(),
708                        "noexec".to_string(),
709                        "mode=1777".to_string(),
710                        "size=65536k".to_string(),
711                    ],
712                },
713                OciMount {
714                    destination: "/sys".to_string(),
715                    source: "sysfs".to_string(),
716                    mount_type: "sysfs".to_string(),
717                    options: vec![
718                        "nosuid".to_string(),
719                        "noexec".to_string(),
720                        "nodev".to_string(),
721                        "ro".to_string(),
722                    ],
723                },
724            ],
725            hooks: None,
726            annotations: HashMap::new(),
727            linux: Some(OciLinux {
728                namespaces: Some(vec![
729                    OciNamespace {
730                        namespace_type: "pid".to_string(),
731                    },
732                    OciNamespace {
733                        namespace_type: "network".to_string(),
734                    },
735                    OciNamespace {
736                        namespace_type: "ipc".to_string(),
737                    },
738                    OciNamespace {
739                        namespace_type: "uts".to_string(),
740                    },
741                    OciNamespace {
742                        namespace_type: "mount".to_string(),
743                    },
744                ]),
745                resources: None,
746                uid_mappings: vec![],
747                gid_mappings: vec![],
748                masked_paths: vec![
749                    "/proc/acpi".to_string(),
750                    "/proc/asound".to_string(),
751                    "/proc/kcore".to_string(),
752                    "/proc/keys".to_string(),
753                    "/proc/latency_stats".to_string(),
754                    "/proc/sched_debug".to_string(),
755                    "/proc/scsi".to_string(),
756                    "/proc/timer_list".to_string(),
757                    "/proc/timer_stats".to_string(),
758                    "/sys/firmware".to_string(),
759                ],
760                readonly_paths: vec![
761                    "/proc/bus".to_string(),
762                    "/proc/fs".to_string(),
763                    "/proc/irq".to_string(),
764                    "/proc/sys".to_string(),
765                    "/proc/sysrq-trigger".to_string(),
766                ],
767                devices: vec![
768                    OciDevice {
769                        device_type: "c".to_string(),
770                        path: "/dev/null".to_string(),
771                        major: Some(1),
772                        minor: Some(3),
773                        file_mode: Some(0o666),
774                        uid: Some(0),
775                        gid: Some(0),
776                    },
777                    OciDevice {
778                        device_type: "c".to_string(),
779                        path: "/dev/zero".to_string(),
780                        major: Some(1),
781                        minor: Some(5),
782                        file_mode: Some(0o666),
783                        uid: Some(0),
784                        gid: Some(0),
785                    },
786                    OciDevice {
787                        device_type: "c".to_string(),
788                        path: "/dev/full".to_string(),
789                        major: Some(1),
790                        minor: Some(7),
791                        file_mode: Some(0o666),
792                        uid: Some(0),
793                        gid: Some(0),
794                    },
795                    OciDevice {
796                        device_type: "c".to_string(),
797                        path: "/dev/random".to_string(),
798                        major: Some(1),
799                        minor: Some(8),
800                        file_mode: Some(0o666),
801                        uid: Some(0),
802                        gid: Some(0),
803                    },
804                    OciDevice {
805                        device_type: "c".to_string(),
806                        path: "/dev/urandom".to_string(),
807                        major: Some(1),
808                        minor: Some(9),
809                        file_mode: Some(0o666),
810                        uid: Some(0),
811                        gid: Some(0),
812                    },
813                ],
814                seccomp: None,
815                rootfs_propagation: Some("rprivate".to_string()),
816                sysctl: HashMap::new(),
817                cgroups_path: None,
818                intel_rdt: None,
819            }),
820        }
821    }
822
823    /// Add resource limits to the config
824    pub fn with_resources(mut self, limits: &ResourceLimits) -> Self {
825        let mut resources = OciResources {
826            memory: None,
827            cpu: None,
828            pids: None,
829        };
830
831        if let Some(memory_bytes) = limits.memory_bytes {
832            resources.memory = Some(OciMemory {
833                limit: Some(memory_bytes as i64),
834            });
835        }
836
837        if let Some(quota_us) = limits.cpu_quota_us {
838            resources.cpu = Some(OciCpu {
839                quota: Some(quota_us as i64),
840                period: Some(limits.cpu_period_us),
841            });
842        }
843
844        if let Some(pids_max) = limits.pids_max {
845            resources.pids = Some(OciPids {
846                limit: pids_max as i64,
847            });
848        }
849
850        if let Some(linux) = &mut self.linux {
851            linux.resources = Some(resources);
852        }
853
854        self
855    }
856
857    /// Add environment variables to the OCI process config.
858    pub fn with_env(mut self, vars: &[(String, String)]) -> Self {
859        for (key, value) in vars {
860            self.process.env.push(format!("{}={}", key, value));
861        }
862        self
863    }
864
865    /// Add sd_notify socket passthrough.
866    pub fn with_sd_notify(mut self) -> Self {
867        if let Ok(notify_socket) = std::env::var("NOTIFY_SOCKET") {
868            self.process
869                .env
870                .push(format!("NOTIFY_SOCKET={}", notify_socket));
871        }
872        self
873    }
874
875    /// Add bind mounts for secrets.
876    pub fn with_secret_mounts(mut self, secrets: &[crate::container::SecretMount]) -> Self {
877        for secret in secrets {
878            self.mounts.push(OciMount {
879                destination: secret.dest.to_string_lossy().to_string(),
880                source: secret.source.to_string_lossy().to_string(),
881                mount_type: "bind".to_string(),
882                options: vec![
883                    "bind".to_string(),
884                    "ro".to_string(),
885                    "nosuid".to_string(),
886                    "nodev".to_string(),
887                    "noexec".to_string(),
888                ],
889            });
890        }
891        self
892    }
893
894    /// Set the process identity for the OCI workload.
895    pub fn with_process_identity(mut self, identity: &crate::container::ProcessIdentity) -> Self {
896        self.process.user.uid = identity.uid;
897        self.process.user.gid = identity.gid;
898        self.process.user.additional_gids = if identity.additional_gids.is_empty() {
899            None
900        } else {
901            Some(identity.additional_gids.clone())
902        };
903        self
904    }
905
906    /// Add a read-only bind mount of an in-memory secret staging directory at
907    /// `/run/secrets`, plus compatibility bind mounts for each staged secret to
908    /// its requested container destination.
909    pub fn with_inmemory_secret_mounts(
910        mut self,
911        stage_dir: &Path,
912        secrets: &[crate::container::SecretMount],
913    ) -> Result<Self> {
914        self.mounts.push(OciMount {
915            destination: "/run/secrets".to_string(),
916            source: stage_dir.to_string_lossy().to_string(),
917            mount_type: "bind".to_string(),
918            options: vec![
919                "bind".to_string(),
920                "ro".to_string(),
921                "nosuid".to_string(),
922                "nodev".to_string(),
923                "noexec".to_string(),
924            ],
925        });
926
927        for secret in secrets {
928            let dest = normalize_container_destination(&secret.dest)?;
929            if !secret.source.starts_with(stage_dir) {
930                return Err(NucleusError::ConfigError(format!(
931                    "Staged secret source {:?} must live under {:?}",
932                    secret.source, stage_dir
933                )));
934            }
935            self.mounts.push(OciMount {
936                destination: dest.to_string_lossy().to_string(),
937                source: secret.source.to_string_lossy().to_string(),
938                mount_type: "bind".to_string(),
939                options: vec![
940                    "bind".to_string(),
941                    "ro".to_string(),
942                    "nosuid".to_string(),
943                    "nodev".to_string(),
944                    "noexec".to_string(),
945                ],
946            });
947        }
948
949        Ok(self)
950    }
951
952    /// Add bind or tmpfs volume mounts.
953    pub fn with_volume_mounts(mut self, volumes: &[crate::container::VolumeMount]) -> Result<Self> {
954        use crate::container::VolumeSource;
955
956        for volume in volumes {
957            let dest = normalize_container_destination(&volume.dest)?;
958            match &volume.source {
959                VolumeSource::Bind { source } => {
960                    let mut options = vec![
961                        "bind".to_string(),
962                        "nosuid".to_string(),
963                        "nodev".to_string(),
964                    ];
965                    if volume.read_only {
966                        options.push("ro".to_string());
967                    }
968                    self.mounts.push(OciMount {
969                        destination: dest.to_string_lossy().to_string(),
970                        source: source.to_string_lossy().to_string(),
971                        mount_type: "bind".to_string(),
972                        options,
973                    });
974                }
975                VolumeSource::Tmpfs { size } => {
976                    let mut options = vec![
977                        "nosuid".to_string(),
978                        "nodev".to_string(),
979                        "mode=0755".to_string(),
980                    ];
981                    if volume.read_only {
982                        options.push("ro".to_string());
983                    }
984                    if let Some(size) = size {
985                        options.push(format!("size={}", size));
986                    }
987                    self.mounts.push(OciMount {
988                        destination: dest.to_string_lossy().to_string(),
989                        source: "tmpfs".to_string(),
990                        mount_type: "tmpfs".to_string(),
991                        options,
992                    });
993                }
994            }
995        }
996
997        Ok(self)
998    }
999
1000    /// Bind mount the host context directory into the container.
1001    ///
1002    /// The gVisor integration path expects `/context` to be writable so test
1003    /// workloads can write results back to the host.
1004    pub fn with_context_bind(mut self, context_dir: &std::path::Path) -> Self {
1005        self.mounts.push(OciMount {
1006            destination: "/context".to_string(),
1007            source: context_dir.to_string_lossy().to_string(),
1008            mount_type: "bind".to_string(),
1009            options: vec![
1010                "bind".to_string(),
1011                "ro".to_string(),
1012                "nosuid".to_string(),
1013                "nodev".to_string(),
1014            ],
1015        });
1016        self
1017    }
1018
1019    /// Add rootfs bind mounts from a pre-built rootfs path.
1020    pub fn with_rootfs_binds(mut self, rootfs_path: &std::path::Path) -> Self {
1021        let subdirs = ["bin", "sbin", "lib", "lib64", "usr", "etc", "nix"];
1022        for subdir in &subdirs {
1023            let source = rootfs_path.join(subdir);
1024            if source.exists() {
1025                self.mounts.push(OciMount {
1026                    destination: format!("/{}", subdir),
1027                    source: source.to_string_lossy().to_string(),
1028                    mount_type: "bind".to_string(),
1029                    options: vec![
1030                        "bind".to_string(),
1031                        "ro".to_string(),
1032                        "nosuid".to_string(),
1033                        "nodev".to_string(),
1034                    ],
1035                });
1036            }
1037        }
1038        self
1039    }
1040
1041    /// Replace the default namespace list with an explicit configuration.
1042    pub fn with_namespace_config(mut self, config: &NamespaceConfig) -> Self {
1043        let mut namespaces = Vec::new();
1044
1045        if config.pid {
1046            namespaces.push(OciNamespace {
1047                namespace_type: "pid".to_string(),
1048            });
1049        }
1050        if config.net {
1051            namespaces.push(OciNamespace {
1052                namespace_type: "network".to_string(),
1053            });
1054        }
1055        if config.ipc {
1056            namespaces.push(OciNamespace {
1057                namespace_type: "ipc".to_string(),
1058            });
1059        }
1060        if config.uts {
1061            namespaces.push(OciNamespace {
1062                namespace_type: "uts".to_string(),
1063            });
1064        }
1065        if config.mnt {
1066            namespaces.push(OciNamespace {
1067                namespace_type: "mount".to_string(),
1068            });
1069        }
1070        if config.cgroup {
1071            namespaces.push(OciNamespace {
1072                namespace_type: "cgroup".to_string(),
1073            });
1074        }
1075        if config.time {
1076            namespaces.push(OciNamespace {
1077                namespace_type: "time".to_string(),
1078            });
1079        }
1080        if config.user {
1081            namespaces.push(OciNamespace {
1082                namespace_type: "user".to_string(),
1083            });
1084        }
1085
1086        if let Some(linux) = &mut self.linux {
1087            linux.namespaces = Some(namespaces);
1088        }
1089
1090        self
1091    }
1092
1093    /// Add read-only bind mounts for host runtime paths.
1094    ///
1095    /// This mirrors the native fallback path for non-production containers so
1096    /// common executables such as `/bin/sh` remain available inside the OCI
1097    /// rootfs when no explicit rootfs is configured.
1098    pub fn with_host_runtime_binds(mut self) -> Self {
1099        // Use a fixed set of standard FHS paths only. Do NOT scan host $PATH,
1100        // which would expose arbitrary host directories inside the container.
1101        let host_paths: BTreeSet<String> =
1102            ["/bin", "/sbin", "/usr", "/lib", "/lib64", "/nix/store"]
1103                .iter()
1104                .map(|s| s.to_string())
1105                .collect();
1106
1107        for host_path in host_paths {
1108            let source = Path::new(&host_path);
1109            if !source.exists() {
1110                continue;
1111            }
1112
1113            self.mounts.push(OciMount {
1114                destination: host_path.clone(),
1115                source: source.to_string_lossy().to_string(),
1116                mount_type: "bind".to_string(),
1117                options: vec![
1118                    "bind".to_string(),
1119                    "ro".to_string(),
1120                    "nosuid".to_string(),
1121                    "nodev".to_string(),
1122                ],
1123            });
1124        }
1125        self
1126    }
1127
1128    /// Add user namespace configuration
1129    pub fn with_user_namespace(mut self) -> Self {
1130        if let Some(linux) = &mut self.linux {
1131            if let Some(namespaces) = &mut linux.namespaces {
1132                namespaces.push(OciNamespace {
1133                    namespace_type: "user".to_string(),
1134                });
1135            }
1136        }
1137        self
1138    }
1139
1140    /// Configure gVisor's true rootless OCI path.
1141    ///
1142    /// gVisor expects UID/GID mappings in the OCI spec for this mode, and its
1143    /// rootless OCI implementation does not currently support a network
1144    /// namespace entry in the spec. We still control networking through
1145    /// runsc's top-level `--network` flag.
1146    pub fn with_rootless_user_namespace(mut self, config: &UserNamespaceConfig) -> Self {
1147        if let Some(linux) = &mut self.linux {
1148            if let Some(namespaces) = &mut linux.namespaces {
1149                namespaces.retain(|ns| ns.namespace_type != "network");
1150                if !namespaces.iter().any(|ns| ns.namespace_type == "user") {
1151                    namespaces.push(OciNamespace {
1152                        namespace_type: "user".to_string(),
1153                    });
1154                }
1155            }
1156            linux.uid_mappings = config.uid_mappings.iter().map(OciIdMapping::from).collect();
1157            linux.gid_mappings = config.gid_mappings.iter().map(OciIdMapping::from).collect();
1158        }
1159        self
1160    }
1161
1162    /// Set OCI lifecycle hooks on the config.
1163    pub fn with_hooks(mut self, hooks: OciHooks) -> Self {
1164        if hooks.is_empty() {
1165            self.hooks = None;
1166        } else {
1167            self.hooks = Some(hooks);
1168        }
1169        self
1170    }
1171
1172    /// Set process rlimits from the hardcoded Nucleus defaults.
1173    ///
1174    /// Mirrors the RLIMIT backstops applied in-process for native containers
1175    /// (runtime.rs), expressed as OCI config so gVisor can enforce them.
1176    pub fn with_rlimits(mut self, pids_max: Option<u64>) -> Self {
1177        let nproc_limit = pids_max.unwrap_or(512);
1178        self.process.rlimits = vec![
1179            OciRlimit {
1180                limit_type: "RLIMIT_NPROC".to_string(),
1181                hard: nproc_limit,
1182                soft: nproc_limit,
1183            },
1184            OciRlimit {
1185                limit_type: "RLIMIT_NOFILE".to_string(),
1186                hard: 1024,
1187                soft: 1024,
1188            },
1189            OciRlimit {
1190                limit_type: "RLIMIT_MEMLOCK".to_string(),
1191                hard: 64 * 1024,
1192                soft: 64 * 1024,
1193            },
1194        ];
1195        self
1196    }
1197
1198    /// Set the linux.seccomp section from an OCI seccomp config.
1199    pub fn with_seccomp(mut self, seccomp: OciSeccomp) -> Self {
1200        if let Some(linux) = &mut self.linux {
1201            linux.seccomp = Some(seccomp);
1202        }
1203        self
1204    }
1205
1206    /// Set the linux.cgroupsPath field.
1207    pub fn with_cgroups_path(mut self, path: String) -> Self {
1208        if let Some(linux) = &mut self.linux {
1209            linux.cgroups_path = Some(path);
1210        }
1211        self
1212    }
1213
1214    /// Set sysctl key-value pairs on the linux config.
1215    pub fn with_sysctl(mut self, sysctl: HashMap<String, String>) -> Self {
1216        if let Some(linux) = &mut self.linux {
1217            linux.sysctl = sysctl;
1218        }
1219        self
1220    }
1221
1222    /// Set annotations on the OCI config.
1223    pub fn with_annotations(mut self, annotations: HashMap<String, String>) -> Self {
1224        self.annotations = annotations;
1225        self
1226    }
1227}
1228
1229impl From<&IdMapping> for OciIdMapping {
1230    fn from(mapping: &IdMapping) -> Self {
1231        Self {
1232            container_id: mapping.container_id,
1233            host_id: mapping.host_id,
1234            size: mapping.count,
1235        }
1236    }
1237}
1238
/// OCI Bundle manager
///
/// Creates and manages OCI-compliant bundles for gVisor.
///
/// A bundle is a directory holding a `rootfs/` subdirectory and a
/// `config.json` file; both are produced by `OciBundle::create`.
pub struct OciBundle {
    /// Directory that holds the bundle; `rootfs/` and `config.json` live
    /// directly inside it.
    bundle_path: PathBuf,
    /// Runtime configuration that `create` serializes to `config.json`.
    config: OciConfig,
}
1246
1247impl OciBundle {
1248    /// Create a new OCI bundle
1249    pub fn new(bundle_path: PathBuf, config: OciConfig) -> Self {
1250        Self {
1251            bundle_path,
1252            config,
1253        }
1254    }
1255
1256    /// Create the bundle directory structure and write config.json
1257    pub fn create(&self) -> Result<()> {
1258        info!("Creating OCI bundle at {:?}", self.bundle_path);
1259
1260        // Create bundle directory
1261        fs::create_dir_all(&self.bundle_path).map_err(|e| {
1262            NucleusError::GVisorError(format!(
1263                "Failed to create bundle directory {:?}: {}",
1264                self.bundle_path, e
1265            ))
1266        })?;
1267        fs::set_permissions(&self.bundle_path, fs::Permissions::from_mode(0o700)).map_err(|e| {
1268            NucleusError::GVisorError(format!(
1269                "Failed to secure bundle directory permissions {:?}: {}",
1270                self.bundle_path, e
1271            ))
1272        })?;
1273
1274        // Create rootfs directory
1275        let rootfs = self.bundle_path.join("rootfs");
1276        fs::create_dir_all(&rootfs).map_err(|e| {
1277            NucleusError::GVisorError(format!("Failed to create rootfs directory: {}", e))
1278        })?;
1279        fs::set_permissions(&rootfs, fs::Permissions::from_mode(0o700)).map_err(|e| {
1280            NucleusError::GVisorError(format!(
1281                "Failed to secure rootfs directory permissions {:?}: {}",
1282                rootfs, e
1283            ))
1284        })?;
1285
1286        // Write config.json
1287        let config_path = self.bundle_path.join("config.json");
1288        let config_json = serde_json::to_string_pretty(&self.config).map_err(|e| {
1289            NucleusError::GVisorError(format!("Failed to serialize OCI config: {}", e))
1290        })?;
1291
1292        let mut file = OpenOptions::new()
1293            .create(true)
1294            .truncate(true)
1295            .write(true)
1296            .mode(0o600)
1297            .open(&config_path)
1298            .map_err(|e| NucleusError::GVisorError(format!("Failed to open config.json: {}", e)))?;
1299        file.write_all(config_json.as_bytes()).map_err(|e| {
1300            NucleusError::GVisorError(format!("Failed to write config.json: {}", e))
1301        })?;
1302        file.sync_all()
1303            .map_err(|e| NucleusError::GVisorError(format!("Failed to sync config.json: {}", e)))?;
1304
1305        debug!("Created OCI bundle structure at {:?}", self.bundle_path);
1306
1307        Ok(())
1308    }
1309
1310    /// Get the rootfs path
1311    pub fn rootfs_path(&self) -> PathBuf {
1312        self.bundle_path.join("rootfs")
1313    }
1314
1315    /// Get the bundle path
1316    pub fn bundle_path(&self) -> &Path {
1317        &self.bundle_path
1318    }
1319
1320    /// Clean up the bundle
1321    pub fn cleanup(&self) -> Result<()> {
1322        if self.bundle_path.exists() {
1323            fs::remove_dir_all(&self.bundle_path).map_err(|e| {
1324                NucleusError::GVisorError(format!("Failed to cleanup bundle: {}", e))
1325            })?;
1326            debug!("Cleaned up OCI bundle at {:?}", self.bundle_path);
1327        }
1328        Ok(())
1329    }
1330}
1331
1332#[cfg(test)]
1333mod tests {
1334    use super::*;
1335    use tempfile::TempDir;
1336
1337    #[test]
1338    fn test_oci_config_new() {
1339        let config = OciConfig::new(vec!["/bin/sh".to_string()], Some("test".to_string()));
1340
1341        assert_eq!(config.oci_version, "1.0.2");
1342        assert_eq!(config.root.path, "rootfs");
1343        assert_eq!(config.process.args, vec!["/bin/sh"]);
1344        assert_eq!(config.hostname, Some("test".to_string()));
1345    }
1346
1347    #[test]
1348    fn test_oci_config_with_resources() {
1349        let limits = ResourceLimits::unlimited()
1350            .with_memory("512M")
1351            .unwrap()
1352            .with_cpu_cores(2.0)
1353            .unwrap();
1354
1355        let config = OciConfig::new(vec!["/bin/sh".to_string()], None).with_resources(&limits);
1356
1357        assert!(config.linux.is_some());
1358        let linux = config.linux.unwrap();
1359        assert!(linux.resources.is_some());
1360
1361        let resources = linux.resources.unwrap();
1362        assert!(resources.memory.is_some());
1363        assert!(resources.cpu.is_some());
1364    }
1365
1366    #[test]
1367    fn test_oci_bundle_create() {
1368        let temp_dir = TempDir::new().unwrap();
1369        let bundle_path = temp_dir.path().join("test-bundle");
1370
1371        let config = OciConfig::new(vec!["/bin/sh".to_string()], None);
1372        let bundle = OciBundle::new(bundle_path.clone(), config);
1373
1374        bundle.create().unwrap();
1375
1376        assert!(bundle_path.exists());
1377        assert!(bundle_path.join("rootfs").exists());
1378        assert!(bundle_path.join("config.json").exists());
1379
1380        bundle.cleanup().unwrap();
1381        assert!(!bundle_path.exists());
1382    }
1383
1384    #[test]
1385    fn test_oci_config_serialization() {
1386        let config = OciConfig::new(vec!["/bin/sh".to_string()], Some("test".to_string()));
1387
1388        let json = serde_json::to_string_pretty(&config).unwrap();
1389        assert!(json.contains("ociVersion"));
1390        assert!(json.contains("1.0.2"));
1391        assert!(json.contains("/bin/sh"));
1392
1393        // Test deserialization
1394        let deserialized: OciConfig = serde_json::from_str(&json).unwrap();
1395        assert_eq!(deserialized.oci_version, config.oci_version);
1396        assert_eq!(deserialized.process.args, config.process.args);
1397    }
1398
1399    #[test]
1400    fn test_host_runtime_binds_uses_fixed_paths_not_host_path() {
1401        // with_host_runtime_binds must NOT scan the host $PATH. Only standard
1402        // FHS paths should be bind-mounted to prevent leaking arbitrary host
1403        // directories into the container. Verify by setting a distinctive PATH
1404        // and checking that none of its entries appear in the resulting mounts.
1405        std::env::set_var("PATH", "/tmp/evil-inject-path/bin:/opt/attacker/sbin");
1406        let config = OciConfig::new(vec!["/bin/sh".to_string()], None).with_host_runtime_binds();
1407        let mount_dests: Vec<&str> = config
1408            .mounts
1409            .iter()
1410            .map(|m| m.destination.as_str())
1411            .collect();
1412        let mount_srcs: Vec<&str> = config.mounts.iter().map(|m| m.source.as_str()).collect();
1413        // Verify no mount references the injected PATH entries
1414        for path in &["/tmp/evil-inject-path", "/opt/attacker"] {
1415            assert!(
1416                !mount_dests.iter().any(|d| d.contains(path)),
1417                "with_host_runtime_binds must not use host $PATH — found {:?} in mount destinations",
1418                path
1419            );
1420            assert!(
1421                !mount_srcs.iter().any(|s| s.contains(path)),
1422                "with_host_runtime_binds must not use host $PATH — found {:?} in mount sources",
1423                path
1424            );
1425        }
1426        // Verify only standard FHS paths are mounted
1427        let allowed_prefixes = ["/bin", "/sbin", "/usr", "/lib", "/lib64", "/nix/store"];
1428        for mount in &config.mounts {
1429            if mount.mount_type == "bind" {
1430                assert!(
1431                    allowed_prefixes
1432                        .iter()
1433                        .any(|p| mount.destination.starts_with(p)),
1434                    "unexpected bind mount destination: {} — only FHS paths allowed",
1435                    mount.destination
1436                );
1437            }
1438        }
1439    }
1440
1441    #[test]
1442    fn test_volume_mounts_include_bind_and_tmpfs_options() {
1443        let tmp = tempfile::TempDir::new().unwrap();
1444        let config = OciConfig::new(vec!["/bin/sh".to_string()], None)
1445            .with_volume_mounts(&[
1446                crate::container::VolumeMount {
1447                    source: crate::container::VolumeSource::Bind {
1448                        source: tmp.path().to_path_buf(),
1449                    },
1450                    dest: std::path::PathBuf::from("/var/lib/app"),
1451                    read_only: true,
1452                },
1453                crate::container::VolumeMount {
1454                    source: crate::container::VolumeSource::Tmpfs {
1455                        size: Some("64M".to_string()),
1456                    },
1457                    dest: std::path::PathBuf::from("/var/cache/app"),
1458                    read_only: false,
1459                },
1460            ])
1461            .unwrap();
1462
1463        assert!(config.mounts.iter().any(|mount| {
1464            mount.destination == "/var/lib/app"
1465                && mount.mount_type == "bind"
1466                && mount.options.contains(&"ro".to_string())
1467        }));
1468        assert!(config.mounts.iter().any(|mount| {
1469            mount.destination == "/var/cache/app"
1470                && mount.mount_type == "tmpfs"
1471                && mount.options.contains(&"size=64M".to_string())
1472        }));
1473    }
1474
1475    #[test]
1476    fn test_oci_config_with_process_identity() {
1477        let config = OciConfig::new(vec!["/bin/sh".to_string()], None).with_process_identity(
1478            &crate::container::ProcessIdentity {
1479                uid: 1001,
1480                gid: 1002,
1481                additional_gids: vec![1003, 1004],
1482            },
1483        );
1484
1485        assert_eq!(config.process.user.uid, 1001);
1486        assert_eq!(config.process.user.gid, 1002);
1487        assert_eq!(config.process.user.additional_gids, Some(vec![1003, 1004]));
1488    }
1489
1490    #[test]
1491    fn test_oci_config_uses_hardcoded_path_not_host() {
1492        // C-3: PATH must be a hardcoded minimal value, never the host's PATH.
1493        // This prevents leaking host filesystem layout into the container.
1494        std::env::set_var("PATH", "/nix/store/secret-hash/bin:/home/user/.local/bin");
1495        let config = OciConfig::new(vec!["/bin/sh".to_string()], None);
1496        let path_env = config
1497            .process
1498            .env
1499            .iter()
1500            .find(|e| e.starts_with("PATH="))
1501            .expect("PATH env must be set");
1502        assert_eq!(
1503            path_env, "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1504            "OCI config must not leak host PATH"
1505        );
1506        assert!(
1507            !path_env.contains("/nix/store/secret"),
1508            "Host PATH must not leak into container"
1509        );
1510    }
1511
1512    #[test]
1513    fn test_oci_hooks_serialization_roundtrip() {
1514        let hooks = OciHooks {
1515            create_runtime: vec![OciHook {
1516                path: "/usr/bin/hook1".to_string(),
1517                args: vec!["hook1".to_string(), "--arg1".to_string()],
1518                env: vec!["FOO=bar".to_string()],
1519                timeout: Some(10),
1520            }],
1521            create_container: vec![],
1522            start_container: vec![],
1523            poststart: vec![OciHook {
1524                path: "/usr/bin/hook2".to_string(),
1525                args: vec![],
1526                env: vec![],
1527                timeout: None,
1528            }],
1529            poststop: vec![],
1530        };
1531
1532        let json = serde_json::to_string_pretty(&hooks).unwrap();
1533        assert!(json.contains("createRuntime"));
1534        assert!(json.contains("/usr/bin/hook1"));
1535        assert!(!json.contains("createContainer")); // empty vecs are skipped
1536
1537        let deserialized: OciHooks = serde_json::from_str(&json).unwrap();
1538        assert_eq!(deserialized.create_runtime.len(), 1);
1539        assert_eq!(deserialized.create_runtime[0].path, "/usr/bin/hook1");
1540        assert_eq!(deserialized.create_runtime[0].timeout, Some(10));
1541        assert_eq!(deserialized.poststart.len(), 1);
1542        assert!(deserialized.create_container.is_empty());
1543    }
1544
1545    #[test]
1546    fn test_oci_hooks_is_empty() {
1547        let empty = OciHooks::default();
1548        assert!(empty.is_empty());
1549
1550        let not_empty = OciHooks {
1551            poststop: vec![OciHook {
1552                path: "/bin/cleanup".to_string(),
1553                args: vec![],
1554                env: vec![],
1555                timeout: None,
1556            }],
1557            ..Default::default()
1558        };
1559        assert!(!not_empty.is_empty());
1560    }
1561
1562    #[test]
1563    fn test_oci_config_with_hooks() {
1564        let hooks = OciHooks {
1565            create_runtime: vec![OciHook {
1566                path: "/usr/bin/setup".to_string(),
1567                args: vec![],
1568                env: vec![],
1569                timeout: None,
1570            }],
1571            ..Default::default()
1572        };
1573
1574        let config = OciConfig::new(vec!["/bin/sh".to_string()], None).with_hooks(hooks);
1575        assert!(config.hooks.is_some());
1576
1577        let json = serde_json::to_string_pretty(&config).unwrap();
1578        assert!(json.contains("hooks"));
1579        assert!(json.contains("createRuntime"));
1580
1581        let deserialized: OciConfig = serde_json::from_str(&json).unwrap();
1582        assert!(deserialized.hooks.is_some());
1583        assert_eq!(deserialized.hooks.unwrap().create_runtime.len(), 1);
1584    }
1585
1586    #[test]
1587    fn test_oci_config_with_empty_hooks_serializes_without_hooks() {
1588        let config =
1589            OciConfig::new(vec!["/bin/sh".to_string()], None).with_hooks(OciHooks::default());
1590        assert!(config.hooks.is_none()); // empty hooks are set to None
1591
1592        let json = serde_json::to_string_pretty(&config).unwrap();
1593        assert!(!json.contains("hooks"));
1594    }
1595
1596    #[test]
1597    fn test_oci_hook_rejects_relative_path() {
1598        let hook = OciHook {
1599            path: "relative/path".to_string(),
1600            args: vec![],
1601            env: vec![],
1602            timeout: None,
1603        };
1604        let state = OciContainerState {
1605            oci_version: "1.0.2".to_string(),
1606            id: "test".to_string(),
1607            status: OciStatus::Creating,
1608            pid: 1234,
1609            bundle: "/tmp/bundle".to_string(),
1610        };
1611        let result = OciHooks::run_hooks(&[hook], &state, "test");
1612        assert!(result.is_err());
1613        let err_msg = result.unwrap_err().to_string();
1614        assert!(err_msg.contains("absolute"), "error: {}", err_msg);
1615    }
1616
1617    /// Read the original PATH from /proc/self/environ.
1618    ///
1619    /// Other tests in this module call `std::env::set_var("PATH", ...)` which
1620    /// corrupts the process environment. /proc/self/environ is frozen at
1621    /// process startup so it always reflects the real PATH.
1622    fn original_path() -> String {
1623        if let Ok(environ) = std::fs::read("/proc/self/environ") {
1624            for entry in environ.split(|&b| b == 0) {
1625                if let Ok(s) = std::str::from_utf8(entry) {
1626                    if let Some(val) = s.strip_prefix("PATH=") {
1627                        return val.to_string();
1628                    }
1629                }
1630            }
1631        }
1632        String::new()
1633    }
1634
1635    /// Resolve the absolute path to bash for test scripts.
1636    fn find_bash() -> String {
1637        let candidates = ["/bin/bash", "/usr/bin/bash"];
1638        for c in &candidates {
1639            if std::path::Path::new(c).exists() {
1640                return c.to_string();
1641            }
1642        }
1643        for dir in original_path().split(':') {
1644            let candidate = std::path::PathBuf::from(dir).join("bash");
1645            if candidate.exists() {
1646                return candidate.to_string_lossy().to_string();
1647            }
1648        }
1649        panic!("Cannot find bash binary for test");
1650    }
1651
1652    /// Write a script file with proper shebang and ensure it's fully flushed before execution.
1653    /// Embeds the original PATH so scripts can find utilities like `cat`/`touch`
1654    /// even when other tests have corrupted the process PATH.
1655    fn write_script(path: &std::path::Path, body: &str) {
1656        use std::io::Write as IoWrite;
1657        let bash = find_bash();
1658        let orig_path = original_path();
1659        let content = format!("#!{}\nexport PATH='{}'\n{}", bash, orig_path, body);
1660        let mut f = OpenOptions::new()
1661            .create(true)
1662            .truncate(true)
1663            .write(true)
1664            .mode(0o755)
1665            .open(path)
1666            .unwrap();
1667        f.write_all(content.as_bytes()).unwrap();
1668        f.sync_all().unwrap();
1669        drop(f);
1670    }
1671
1672    #[test]
1673    fn test_oci_hook_executes_successfully() {
1674        let temp_dir = TempDir::new().unwrap();
1675        let hook_script = temp_dir.path().join("hook.sh");
1676        let output_file = temp_dir.path().join("output.json");
1677
1678        write_script(
1679            &hook_script,
1680            &format!("cat > {}\n", output_file.to_string_lossy()),
1681        );
1682
1683        let hook = OciHook {
1684            path: hook_script.to_string_lossy().to_string(),
1685            args: vec![],
1686            env: vec![],
1687            timeout: Some(5),
1688        };
1689        let state = OciContainerState {
1690            oci_version: "1.0.2".to_string(),
1691            id: "test-container".to_string(),
1692            status: OciStatus::Creating,
1693            pid: 12345,
1694            bundle: "/tmp/test-bundle".to_string(),
1695        };
1696
1697        OciHooks::run_hooks(&[hook], &state, "createRuntime").unwrap();
1698
1699        // Verify the hook received the container state JSON on stdin
1700        let written = std::fs::read_to_string(&output_file).unwrap();
1701        let parsed: serde_json::Value = serde_json::from_str(&written).unwrap();
1702        assert_eq!(parsed["id"], "test-container");
1703        assert_eq!(parsed["pid"], 12345);
1704        assert_eq!(parsed["status"], "creating");
1705    }
1706
1707    #[test]
1708    fn test_oci_hook_nonzero_exit_is_error() {
1709        let temp_dir = TempDir::new().unwrap();
1710        let hook_script = temp_dir.path().join("fail.sh");
1711        write_script(&hook_script, "exit 1\n");
1712
1713        let hook = OciHook {
1714            path: hook_script.to_string_lossy().to_string(),
1715            args: vec![],
1716            env: vec![],
1717            timeout: Some(5),
1718        };
1719        let state = OciContainerState {
1720            oci_version: "1.0.2".to_string(),
1721            id: "test".to_string(),
1722            status: OciStatus::Creating,
1723            pid: 1,
1724            bundle: "".to_string(),
1725        };
1726
1727        let result = OciHooks::run_hooks(&[hook], &state, "test");
1728        assert!(result.is_err());
1729        assert!(result
1730            .unwrap_err()
1731            .to_string()
1732            .contains("exited with status"));
1733    }
1734
1735    #[test]
1736    fn test_oci_hooks_best_effort_continues_on_failure() {
1737        let temp_dir = TempDir::new().unwrap();
1738        let fail_script = temp_dir.path().join("fail.sh");
1739        write_script(&fail_script, "exit 1\n");
1740
1741        let marker = temp_dir.path().join("ran");
1742        let ok_script = temp_dir.path().join("ok.sh");
1743        write_script(&ok_script, &format!("touch {}\n", marker.to_string_lossy()));
1744
1745        let hooks = vec![
1746            OciHook {
1747                path: fail_script.to_string_lossy().to_string(),
1748                args: vec![],
1749                env: vec![],
1750                timeout: Some(5),
1751            },
1752            OciHook {
1753                path: ok_script.to_string_lossy().to_string(),
1754                args: vec![],
1755                env: vec![],
1756                timeout: Some(5),
1757            },
1758        ];
1759        let state = OciContainerState {
1760            oci_version: "1.0.2".to_string(),
1761            id: "test".to_string(),
1762            status: OciStatus::Stopped,
1763            pid: 0,
1764            bundle: "".to_string(),
1765        };
1766
1767        // best_effort should not panic or return error
1768        OciHooks::run_hooks_best_effort(&hooks, &state, "poststop");
1769        // Second hook should have run despite first failing
1770        assert!(marker.exists(), "second hook should run after first fails");
1771    }
1772}