Skip to main content

nucleus/oci/
mod.rs

1use crate::container::OciStatus;
2use crate::error::{NucleusError, Result};
3use crate::filesystem::{normalize_container_destination, normalize_volume_destination};
4use crate::isolation::{IdMapping, NamespaceConfig, UserNamespaceConfig};
5use crate::resources::ResourceLimits;
6use serde::{Deserialize, Serialize};
7use std::collections::{BTreeSet, HashMap};
8use std::ffi::CString;
9use std::fs;
10use std::fs::OpenOptions;
11use std::io::Write;
12use std::os::fd::{AsRawFd, FromRawFd};
13use std::os::unix::fs::{OpenOptionsExt, PermissionsExt};
14use std::path::{Path, PathBuf};
15use tracing::{debug, info, warn};
16
17/// OCI Runtime Specification configuration
18///
19/// This implements a subset of the OCI runtime spec for gVisor compatibility
20/// Spec: <https://github.com/opencontainers/runtime-spec/blob/main/config.md>
21#[derive(Debug, Clone, Serialize, Deserialize)]
22pub struct OciConfig {
23    #[serde(rename = "ociVersion")]
24    pub oci_version: String,
25
26    pub root: OciRoot,
27    pub process: OciProcess,
28    pub hostname: Option<String>,
29    pub mounts: Vec<OciMount>,
30    pub linux: Option<OciLinux>,
31    #[serde(default, skip_serializing_if = "Option::is_none")]
32    pub hooks: Option<OciHooks>,
33    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
34    pub annotations: HashMap<String, String>,
35}
36
37#[derive(Debug, Clone, Serialize, Deserialize)]
38pub struct OciRoot {
39    pub path: String,
40    pub readonly: bool,
41}
42
43#[derive(Debug, Clone, Serialize, Deserialize)]
44pub struct OciProcess {
45    pub terminal: bool,
46    pub user: OciUser,
47    pub args: Vec<String>,
48    pub env: Vec<String>,
49    pub cwd: String,
50    #[serde(rename = "noNewPrivileges")]
51    pub no_new_privileges: bool,
52    pub capabilities: Option<OciCapabilities>,
53    #[serde(default, skip_serializing_if = "Vec::is_empty")]
54    pub rlimits: Vec<OciRlimit>,
55    #[serde(
56        rename = "consoleSize",
57        default,
58        skip_serializing_if = "Option::is_none"
59    )]
60    pub console_size: Option<OciConsoleSize>,
61    #[serde(
62        rename = "apparmorProfile",
63        default,
64        skip_serializing_if = "Option::is_none"
65    )]
66    pub apparmor_profile: Option<String>,
67    #[serde(
68        rename = "selinuxLabel",
69        default,
70        skip_serializing_if = "Option::is_none"
71    )]
72    pub selinux_label: Option<String>,
73}
74
75#[derive(Debug, Clone, Serialize, Deserialize)]
76pub struct OciUser {
77    pub uid: u32,
78    pub gid: u32,
79    #[serde(skip_serializing_if = "Option::is_none")]
80    pub additional_gids: Option<Vec<u32>>,
81}
82
83#[derive(Debug, Clone, Serialize, Deserialize)]
84pub struct OciCapabilities {
85    pub bounding: Vec<String>,
86    pub effective: Vec<String>,
87    pub inheritable: Vec<String>,
88    pub permitted: Vec<String>,
89    pub ambient: Vec<String>,
90}
91
92#[derive(Debug, Clone, Serialize, Deserialize)]
93pub struct OciMount {
94    pub destination: String,
95    pub source: String,
96    #[serde(rename = "type")]
97    pub mount_type: String,
98    pub options: Vec<String>,
99}
100
101#[derive(Debug, Clone, Serialize, Deserialize)]
102pub struct OciLinux {
103    #[serde(skip_serializing_if = "Option::is_none")]
104    pub namespaces: Option<Vec<OciNamespace>>,
105    #[serde(skip_serializing_if = "Option::is_none")]
106    pub resources: Option<OciResources>,
107    #[serde(rename = "uidMappings", skip_serializing_if = "Vec::is_empty", default)]
108    pub uid_mappings: Vec<OciIdMapping>,
109    #[serde(rename = "gidMappings", skip_serializing_if = "Vec::is_empty", default)]
110    pub gid_mappings: Vec<OciIdMapping>,
111    #[serde(rename = "maskedPaths", skip_serializing_if = "Vec::is_empty", default)]
112    pub masked_paths: Vec<String>,
113    #[serde(
114        rename = "readonlyPaths",
115        skip_serializing_if = "Vec::is_empty",
116        default
117    )]
118    pub readonly_paths: Vec<String>,
119    #[serde(default, skip_serializing_if = "Vec::is_empty")]
120    pub devices: Vec<OciDevice>,
121    #[serde(default, skip_serializing_if = "Option::is_none")]
122    pub seccomp: Option<OciSeccomp>,
123    #[serde(
124        rename = "rootfsPropagation",
125        default,
126        skip_serializing_if = "Option::is_none"
127    )]
128    pub rootfs_propagation: Option<String>,
129    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
130    pub sysctl: HashMap<String, String>,
131    #[serde(
132        rename = "cgroupsPath",
133        default,
134        skip_serializing_if = "Option::is_none"
135    )]
136    pub cgroups_path: Option<String>,
137    #[serde(rename = "intelRdt", default, skip_serializing_if = "Option::is_none")]
138    pub intel_rdt: Option<OciIntelRdt>,
139}
140
141#[derive(Debug, Clone, Serialize, Deserialize)]
142pub struct OciNamespace {
143    #[serde(rename = "type")]
144    pub namespace_type: String,
145}
146
147#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
148pub struct OciIdMapping {
149    #[serde(rename = "containerID")]
150    pub container_id: u32,
151    #[serde(rename = "hostID")]
152    pub host_id: u32,
153    pub size: u32,
154}
155
156#[derive(Debug, Clone, Serialize, Deserialize)]
157pub struct OciResources {
158    #[serde(skip_serializing_if = "Option::is_none")]
159    pub memory: Option<OciMemory>,
160    #[serde(skip_serializing_if = "Option::is_none")]
161    pub cpu: Option<OciCpu>,
162    #[serde(skip_serializing_if = "Option::is_none")]
163    pub pids: Option<OciPids>,
164}
165
166#[derive(Debug, Clone, Serialize, Deserialize)]
167pub struct OciMemory {
168    #[serde(skip_serializing_if = "Option::is_none")]
169    pub limit: Option<i64>,
170}
171
172#[derive(Debug, Clone, Serialize, Deserialize)]
173pub struct OciCpu {
174    #[serde(skip_serializing_if = "Option::is_none")]
175    pub quota: Option<i64>,
176    #[serde(skip_serializing_if = "Option::is_none")]
177    pub period: Option<u64>,
178}
179
180#[derive(Debug, Clone, Serialize, Deserialize)]
181pub struct OciPids {
182    pub limit: i64,
183}
184
185/// OCI process resource limit.
186///
187/// Spec: <https://github.com/opencontainers/runtime-spec/blob/main/config.md#posix-process>
188#[derive(Debug, Clone, Serialize, Deserialize)]
189pub struct OciRlimit {
190    /// Resource type (e.g. "RLIMIT_NOFILE", "RLIMIT_NPROC")
191    #[serde(rename = "type")]
192    pub limit_type: String,
193    /// Hard limit
194    pub hard: u64,
195    /// Soft limit
196    pub soft: u64,
197}
198
199/// OCI console size for terminal-attached processes.
200#[derive(Debug, Clone, Serialize, Deserialize)]
201pub struct OciConsoleSize {
202    pub height: u32,
203    pub width: u32,
204}
205
206/// OCI linux device entry.
207///
208/// Spec: <https://github.com/opencontainers/runtime-spec/blob/main/config-linux.md#devices>
209#[derive(Debug, Clone, Serialize, Deserialize)]
210pub struct OciDevice {
211    /// Device type: "c" (char), "b" (block), "u" (unbuffered), "p" (FIFO)
212    #[serde(rename = "type")]
213    pub device_type: String,
214    /// Device path inside the container
215    pub path: String,
216    /// Major number
217    #[serde(skip_serializing_if = "Option::is_none")]
218    pub major: Option<i64>,
219    /// Minor number
220    #[serde(skip_serializing_if = "Option::is_none")]
221    pub minor: Option<i64>,
222    /// File mode (permissions)
223    #[serde(rename = "fileMode", skip_serializing_if = "Option::is_none")]
224    pub file_mode: Option<u32>,
225    /// UID of the device owner
226    #[serde(skip_serializing_if = "Option::is_none")]
227    pub uid: Option<u32>,
228    /// GID of the device owner
229    #[serde(skip_serializing_if = "Option::is_none")]
230    pub gid: Option<u32>,
231}
232
233/// OCI seccomp configuration.
234///
235/// Spec: <https://github.com/opencontainers/runtime-spec/blob/main/config-linux.md#seccomp>
236#[derive(Debug, Clone, Serialize, Deserialize)]
237pub struct OciSeccomp {
238    /// Default action when no rule matches (e.g. "SCMP_ACT_ERRNO", "SCMP_ACT_ALLOW")
239    #[serde(rename = "defaultAction")]
240    pub default_action: String,
241    /// Target architectures
242    #[serde(default, skip_serializing_if = "Vec::is_empty")]
243    pub architectures: Vec<String>,
244    /// Syscall rules
245    #[serde(default, skip_serializing_if = "Vec::is_empty")]
246    pub syscalls: Vec<OciSeccompSyscall>,
247}
248
249/// A single seccomp syscall rule.
250#[derive(Debug, Clone, Serialize, Deserialize)]
251pub struct OciSeccompSyscall {
252    /// Syscall names this rule applies to
253    pub names: Vec<String>,
254    /// Action to take (e.g. "SCMP_ACT_ALLOW")
255    pub action: String,
256    /// Optional argument conditions
257    #[serde(default, skip_serializing_if = "Vec::is_empty")]
258    pub args: Vec<OciSeccompArg>,
259}
260
261/// Seccomp syscall argument filter.
262#[derive(Debug, Clone, Serialize, Deserialize)]
263pub struct OciSeccompArg {
264    /// Argument index (0-based)
265    pub index: u32,
266    /// Value to compare against
267    pub value: u64,
268    /// Second value for masked operations
269    #[serde(rename = "valueTwo", default, skip_serializing_if = "is_zero")]
270    pub value_two: u64,
271    /// Comparison operator (e.g. "SCMP_CMP_EQ", "SCMP_CMP_MASKED_EQ")
272    pub op: String,
273}
274
/// Serde `skip_serializing_if` predicate: true exactly when the value is 0.
///
/// Takes a reference because that is the signature serde requires for such predicates.
fn is_zero(v: &u64) -> bool {
    matches!(*v, 0)
}
278
279/// OCI Intel RDT (Resource Director Technology) configuration.
280///
281/// Spec: <https://github.com/opencontainers/runtime-spec/blob/main/config-linux.md#intel-rdt>
282#[derive(Debug, Clone, Serialize, Deserialize)]
283pub struct OciIntelRdt {
284    /// Unique identity for the container's cache and memory bandwidth allocation
285    #[serde(rename = "closID", default, skip_serializing_if = "Option::is_none")]
286    pub clos_id: Option<String>,
287    /// Schema for L3 cache allocation
288    #[serde(
289        rename = "l3CacheSchema",
290        default,
291        skip_serializing_if = "Option::is_none"
292    )]
293    pub l3_cache_schema: Option<String>,
294    /// Schema for memory bandwidth allocation
295    #[serde(
296        rename = "memBwSchema",
297        default,
298        skip_serializing_if = "Option::is_none"
299    )]
300    pub mem_bw_schema: Option<String>,
301}
302
303/// A single OCI lifecycle hook entry.
304///
305/// Spec: <https://github.com/opencontainers/runtime-spec/blob/main/config.md#posix-platform-hooks>
306#[derive(Debug, Clone, Serialize, Deserialize)]
307pub struct OciHook {
308    /// Absolute path to the hook binary.
309    pub path: String,
310    /// Arguments passed to the hook (argv\[0\] should be the binary name).
311    #[serde(default, skip_serializing_if = "Vec::is_empty")]
312    pub args: Vec<String>,
313    /// Environment variables for the hook process.
314    #[serde(default, skip_serializing_if = "Vec::is_empty")]
315    pub env: Vec<String>,
316    /// Timeout in seconds. If the hook does not exit within this duration it is killed.
317    #[serde(default, skip_serializing_if = "Option::is_none")]
318    pub timeout: Option<u32>,
319}
320
321/// OCI lifecycle hooks.
322///
323/// Spec: <https://github.com/opencontainers/runtime-spec/blob/main/config.md#posix-platform-hooks>
324#[derive(Debug, Clone, Default, Serialize, Deserialize)]
325pub struct OciHooks {
326    /// Called after the runtime environment has been created but before pivot_root.
327    #[serde(
328        rename = "createRuntime",
329        default,
330        skip_serializing_if = "Vec::is_empty"
331    )]
332    pub create_runtime: Vec<OciHook>,
333    /// Called after pivot_root but before the start operation.
334    #[serde(
335        rename = "createContainer",
336        default,
337        skip_serializing_if = "Vec::is_empty"
338    )]
339    pub create_container: Vec<OciHook>,
340    /// Called after the start operation but before the user process executes.
341    #[serde(
342        rename = "startContainer",
343        default,
344        skip_serializing_if = "Vec::is_empty"
345    )]
346    pub start_container: Vec<OciHook>,
347    /// Called after the user-specified process has started.
348    #[serde(default, skip_serializing_if = "Vec::is_empty")]
349    pub poststart: Vec<OciHook>,
350    /// Called after the container has been stopped.
351    #[serde(default, skip_serializing_if = "Vec::is_empty")]
352    pub poststop: Vec<OciHook>,
353}
354
355/// Container state JSON passed to OCI hooks on stdin.
356///
357/// Spec: <https://github.com/opencontainers/runtime-spec/blob/main/runtime.md#state>
358#[derive(Debug, Clone, Serialize)]
359pub struct OciContainerState {
360    #[serde(rename = "ociVersion")]
361    pub oci_version: String,
362    pub id: String,
363    pub status: OciStatus,
364    pub pid: u32,
365    pub bundle: String,
366}
367
368impl OciHooks {
369    /// Returns true if there are no hooks configured.
370    pub fn is_empty(&self) -> bool {
371        self.create_runtime.is_empty()
372            && self.create_container.is_empty()
373            && self.start_container.is_empty()
374            && self.poststart.is_empty()
375            && self.poststop.is_empty()
376    }
377
378    /// Execute a list of hooks in order, passing container state JSON on stdin.
379    ///
380    /// If any hook exits non-zero, an error is returned immediately (remaining hooks are skipped).
381    pub fn run_hooks(hooks: &[OciHook], state: &OciContainerState, phase: &str) -> Result<()> {
382        let state_json = serde_json::to_string(state).map_err(|e| {
383            NucleusError::HookError(format!(
384                "Failed to serialize container state for hook: {}",
385                e
386            ))
387        })?;
388
389        for (i, hook) in hooks.iter().enumerate() {
390            info!(
391                "Running {} hook [{}/{}]: {}",
392                phase,
393                i + 1,
394                hooks.len(),
395                hook.path
396            );
397            Self::execute_hook(hook, &state_json, phase)?;
398        }
399
400        Ok(())
401    }
402
403    /// Execute a list of hooks best-effort (log errors but don't fail).
404    ///
405    /// Used for poststop hooks per the OCI spec: errors MUST be logged but MUST NOT
406    /// prevent cleanup.
407    pub fn run_hooks_best_effort(hooks: &[OciHook], state: &OciContainerState, phase: &str) {
408        let state_json = match serde_json::to_string(state) {
409            Ok(json) => json,
410            Err(e) => {
411                warn!(
412                    "Failed to serialize container state for {} hooks: {}",
413                    phase, e
414                );
415                return;
416            }
417        };
418
419        for (i, hook) in hooks.iter().enumerate() {
420            info!(
421                "Running {} hook [{}/{}]: {}",
422                phase,
423                i + 1,
424                hooks.len(),
425                hook.path
426            );
427            if let Err(e) = Self::execute_hook(hook, &state_json, phase) {
428                warn!("{} hook [{}] failed (continuing): {}", phase, i + 1, e);
429            }
430        }
431    }
432
433    fn execute_hook(hook: &OciHook, state_json: &str, phase: &str) -> Result<()> {
434        #[cfg(not(test))]
435        use std::os::unix::process::CommandExt;
436        use std::process::{Command, Stdio};
437
438        let hook_path = Path::new(&hook.path);
439        if !hook_path.is_absolute() {
440            return Err(NucleusError::HookError(format!(
441                "{} hook path must be absolute: {}",
442                phase, hook.path
443            )));
444        }
445
446        // Restrict hooks to trusted system directories. Hooks execute in
447        // the parent process before security hardening (by OCI spec), so
448        // they must come from locations that unprivileged users cannot write to.
449        #[cfg(not(test))]
450        {
451            const TRUSTED_HOOK_PREFIXES: &[&str] = &[
452                "/usr/bin/",
453                "/usr/sbin/",
454                "/usr/lib/",
455                "/usr/libexec/",
456                "/usr/local/bin/",
457                "/usr/local/sbin/",
458                "/usr/local/libexec/",
459                "/bin/",
460                "/sbin/",
461                "/nix/store/",
462                "/opt/",
463            ];
464            if !TRUSTED_HOOK_PREFIXES
465                .iter()
466                .any(|prefix| hook.path.starts_with(prefix))
467            {
468                return Err(NucleusError::HookError(format!(
469                    "{} hook path '{}' is not under a trusted directory ({:?})",
470                    phase, hook.path, TRUSTED_HOOK_PREFIXES
471                )));
472            }
473        }
474
475        // Use symlink_metadata (lstat) instead of .exists() to avoid
476        // following symlinks in the existence check. Reject symlinked hooks
477        // to prevent a TOCTOU swap between the check and exec.
478        match std::fs::symlink_metadata(hook_path) {
479            Ok(meta) if meta.file_type().is_symlink() => {
480                return Err(NucleusError::HookError(format!(
481                    "{} hook path is a symlink (refusing to follow): {}",
482                    phase, hook.path
483                )));
484            }
485            Err(_) => {
486                return Err(NucleusError::HookError(format!(
487                    "{} hook binary not found: {}",
488                    phase, hook.path
489                )));
490            }
491            Ok(_) => {}
492        }
493
494        // C-1: Validate hook binary ownership and permissions to prevent
495        // execution of world-writable or unexpectedly-owned binaries.
496        // Similar to runsc's hook validation – reject hooks that could be
497        // tampered with by unprivileged users.
498        Self::validate_hook_binary(hook_path, phase)?;
499
500        let mut cmd = Command::new(&hook.path);
501        if !hook.args.is_empty() {
502            // OCI spec: args[0] is the binary name (like execve argv); pass rest as arguments
503            cmd.args(&hook.args[1..]);
504        }
505
506        if !hook.env.is_empty() {
507            cmd.env_clear();
508            for entry in &hook.env {
509                if let Some((key, value)) = entry.split_once('=') {
510                    cmd.env(key, value);
511                }
512            }
513        }
514
515        // C-1: Drop all capabilities and set restrictive resource limits
516        // for hook execution. Hooks run in the parent process before security
517        // hardening, so we sandbox them defensively.
518        cmd.stdin(Stdio::piped());
519        cmd.stdout(Stdio::piped());
520        cmd.stderr(Stdio::piped());
521
522        // C-1: Apply RLIMIT backstops only in the spawned child process
523        // via pre_exec, so the parent process is not affected.
524        // Note: pre_exec runs after fork but before exec, in the child process.
525        #[cfg(not(test))]
526        unsafe {
527            cmd.pre_exec(|| {
528                // Prevent the hook from gaining privileges via setuid/setgid
529                // binaries or file capabilities. This must be set before exec.
530                if libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) != 0 {
531                    return Err(std::io::Error::last_os_error());
532                }
533
534                let rlim_nproc = libc::rlimit {
535                    rlim_cur: 1024,
536                    rlim_max: 1024,
537                };
538                if libc::setrlimit(libc::RLIMIT_NPROC, &rlim_nproc) != 0 {
539                    return Err(std::io::Error::last_os_error());
540                }
541
542                let rlim_nofile = libc::rlimit {
543                    rlim_cur: 1024,
544                    rlim_max: 1024,
545                };
546                if libc::setrlimit(libc::RLIMIT_NOFILE, &rlim_nofile) != 0 {
547                    return Err(std::io::Error::last_os_error());
548                }
549
550                Ok(())
551            });
552        }
553
554        const TEXT_FILE_BUSY_SPAWN_RETRIES: usize = 100;
555        const TEXT_FILE_BUSY_RETRY_DELAY: std::time::Duration =
556            std::time::Duration::from_millis(10);
557
558        let mut text_file_busy_retries = 0;
559        let mut child = loop {
560            match cmd.spawn() {
561                Ok(child) => break child,
562                Err(e)
563                    if e.raw_os_error() == Some(libc::ETXTBSY)
564                        && text_file_busy_retries < TEXT_FILE_BUSY_SPAWN_RETRIES =>
565                {
566                    text_file_busy_retries += 1;
567                    debug!(
568                        "{} hook {} was busy during spawn; retrying ({}/{})",
569                        phase, hook.path, text_file_busy_retries, TEXT_FILE_BUSY_SPAWN_RETRIES
570                    );
571                    std::thread::sleep(TEXT_FILE_BUSY_RETRY_DELAY);
572                }
573                Err(e) => {
574                    return Err(NucleusError::HookError(format!(
575                        "Failed to spawn {} hook {}: {}",
576                        phase, hook.path, e
577                    )));
578                }
579            }
580        };
581
582        if let Some(mut stdin) = child.stdin.take() {
583            use std::io::Write as IoWrite;
584            let _ = stdin.write_all(state_json.as_bytes());
585        }
586
587        let timeout_secs = hook.timeout.unwrap_or(30) as u64;
588        let start = std::time::Instant::now();
589        let timeout = std::time::Duration::from_secs(timeout_secs);
590
591        loop {
592            match child.try_wait() {
593                Ok(Some(status)) => {
594                    if status.success() {
595                        debug!("{} hook {} completed successfully", phase, hook.path);
596                        return Ok(());
597                    } else {
598                        let stderr = child
599                            .stderr
600                            .take()
601                            .map(|mut e| {
602                                let mut buf = String::new();
603                                use std::io::Read;
604                                let _ = e.read_to_string(&mut buf);
605                                buf
606                            })
607                            .unwrap_or_default();
608                        return Err(NucleusError::HookError(format!(
609                            "{} hook {} exited with status: {}{}",
610                            phase,
611                            hook.path,
612                            status,
613                            if stderr.is_empty() {
614                                String::new()
615                            } else {
616                                format!(" (stderr: {})", stderr.trim())
617                            }
618                        )));
619                    }
620                }
621                Ok(None) => {
622                    if start.elapsed() >= timeout {
623                        let _ = child.kill();
624                        let _ = child.wait();
625                        return Err(NucleusError::HookError(format!(
626                            "{} hook {} timed out after {}s",
627                            phase, hook.path, timeout_secs
628                        )));
629                    }
630                    std::thread::sleep(std::time::Duration::from_millis(50));
631                }
632                Err(e) => {
633                    return Err(NucleusError::HookError(format!(
634                        "Failed to wait for {} hook {}: {}",
635                        phase, hook.path, e
636                    )));
637                }
638            }
639        }
640    }
641
642    /// Validate hook binary ownership and permissions.
643    ///
644    /// Rejects hooks that are world-writable or group-writable, or owned by
645    /// a UID that doesn't match the effective UID or root. This prevents
646    /// privilege escalation via tampered hook binaries.
647    fn validate_hook_binary(hook_path: &Path, phase: &str) -> Result<()> {
648        // Use symlink_metadata (lstat) to inspect the hook path itself
649        // rather than following symlinks, consistent with the rejection
650        // of symlinked hooks above.
651        let metadata = std::fs::symlink_metadata(hook_path).map_err(|e| {
652            NucleusError::HookError(format!(
653                "Failed to stat {} hook {}: {}",
654                phase,
655                hook_path.display(),
656                e
657            ))
658        })?;
659
660        use std::os::unix::fs::MetadataExt;
661        let mode = metadata.mode();
662        let uid = metadata.uid();
663        let gid = metadata.gid();
664        let effective_uid = nix::unistd::Uid::effective().as_raw();
665
666        // Reject world-writable hooks
667        if mode & 0o002 != 0 {
668            return Err(NucleusError::HookError(format!(
669                "{} hook {} is world-writable (mode {:04o}) – refusing to execute",
670                phase,
671                hook_path.display(),
672                mode & 0o7777
673            )));
674        }
675
676        // Reject group-writable hooks unless owned by root
677        if mode & 0o020 != 0 && uid != 0 {
678            return Err(NucleusError::HookError(format!(
679                "{} hook {} is group-writable and not owned by root (mode {:04o}, uid {}) – refusing to execute",
680                phase,
681                hook_path.display(),
682                mode & 0o7777,
683                uid
684            )));
685        }
686
687        // Reject hooks owned by arbitrary UIDs – must be root or effective UID
688        if uid != 0 && uid != effective_uid {
689            return Err(NucleusError::HookError(format!(
690                "{} hook {} is owned by UID {} (expected 0 or {}) – refusing to execute",
691                phase,
692                hook_path.display(),
693                uid,
694                effective_uid
695            )));
696        }
697
698        // Reject hooks with setuid/setgid bits
699        if mode & 0o6000 != 0 {
700            return Err(NucleusError::HookError(format!(
701                "{} hook {} has setuid/setgid bits (mode {:04o}) – refusing to execute",
702                phase,
703                hook_path.display(),
704                mode & 0o7777
705            )));
706        }
707
708        debug!(
709            "{} hook {} validation passed (uid={}, gid={}, mode={:04o})",
710            phase,
711            hook_path.display(),
712            uid,
713            gid,
714            mode & 0o7777
715        );
716
717        Ok(())
718    }
719}
720
721impl OciConfig {
722    /// Create a minimal OCI config for Nucleus containers
723    pub fn new(command: Vec<String>, hostname: Option<String>) -> Self {
724        Self {
725            oci_version: "1.0.2".to_string(),
726            root: OciRoot {
727                path: "rootfs".to_string(),
728                readonly: true,
729            },
730            process: OciProcess {
731                terminal: false,
732                user: OciUser {
733                    uid: 0,
734                    gid: 0,
735                    additional_gids: None,
736                },
737                args: command,
738                env: vec![
739                    "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin".to_string(),
740                ],
741                cwd: "/".to_string(),
742                no_new_privileges: true,
743                capabilities: Some(OciCapabilities {
744                    bounding: vec![],
745                    effective: vec![],
746                    inheritable: vec![],
747                    permitted: vec![],
748                    ambient: vec![],
749                }),
750                rlimits: vec![],
751                console_size: None,
752                apparmor_profile: None,
753                selinux_label: None,
754            },
755            hostname,
756            mounts: vec![
757                OciMount {
758                    destination: "/proc".to_string(),
759                    source: "proc".to_string(),
760                    mount_type: "proc".to_string(),
761                    options: vec![
762                        "nosuid".to_string(),
763                        "noexec".to_string(),
764                        "nodev".to_string(),
765                    ],
766                },
767                OciMount {
768                    destination: "/dev".to_string(),
769                    source: "tmpfs".to_string(),
770                    mount_type: "tmpfs".to_string(),
771                    options: vec![
772                        "nosuid".to_string(),
773                        "noexec".to_string(),
774                        "strictatime".to_string(),
775                        "mode=755".to_string(),
776                        "size=65536k".to_string(),
777                    ],
778                },
779                OciMount {
780                    destination: "/dev/shm".to_string(),
781                    source: "shm".to_string(),
782                    mount_type: "tmpfs".to_string(),
783                    options: vec![
784                        "nosuid".to_string(),
785                        "noexec".to_string(),
786                        "nodev".to_string(),
787                        "mode=1777".to_string(),
788                        "size=65536k".to_string(),
789                    ],
790                },
791                OciMount {
792                    destination: "/tmp".to_string(),
793                    source: "tmpfs".to_string(),
794                    mount_type: "tmpfs".to_string(),
795                    options: vec![
796                        "nosuid".to_string(),
797                        "nodev".to_string(),
798                        "noexec".to_string(),
799                        "mode=1777".to_string(),
800                        "size=65536k".to_string(),
801                    ],
802                },
803                OciMount {
804                    destination: "/sys".to_string(),
805                    source: "sysfs".to_string(),
806                    mount_type: "sysfs".to_string(),
807                    options: vec![
808                        "nosuid".to_string(),
809                        "noexec".to_string(),
810                        "nodev".to_string(),
811                        "ro".to_string(),
812                    ],
813                },
814            ],
815            hooks: None,
816            annotations: HashMap::new(),
817            linux: Some(OciLinux {
818                namespaces: Some(vec![
819                    OciNamespace {
820                        namespace_type: "pid".to_string(),
821                    },
822                    OciNamespace {
823                        namespace_type: "network".to_string(),
824                    },
825                    OciNamespace {
826                        namespace_type: "ipc".to_string(),
827                    },
828                    OciNamespace {
829                        namespace_type: "uts".to_string(),
830                    },
831                    OciNamespace {
832                        namespace_type: "mount".to_string(),
833                    },
834                ]),
835                resources: None,
836                uid_mappings: vec![],
837                gid_mappings: vec![],
838                // M14: Aligned with native masked paths in mount.rs (PROC_NULL_MASKED)
839                masked_paths: vec![
840                    "/proc/acpi".to_string(),
841                    "/proc/asound".to_string(),
842                    "/proc/kcore".to_string(),
843                    "/proc/keys".to_string(),
844                    "/proc/latency_stats".to_string(),
845                    "/proc/sched_debug".to_string(),
846                    "/proc/scsi".to_string(),
847                    "/proc/timer_list".to_string(),
848                    "/proc/timer_stats".to_string(),
849                    "/proc/sysrq-trigger".to_string(), // M14: null-mask, not read-only
850                    "/proc/kpagecount".to_string(),
851                    "/proc/kpageflags".to_string(),
852                    "/proc/kpagecgroup".to_string(),
853                    "/proc/config.gz".to_string(),
854                    "/proc/kallsyms".to_string(),
855                    "/sys/firmware".to_string(),
856                ],
857                readonly_paths: vec![
858                    "/proc/bus".to_string(),
859                    "/proc/fs".to_string(),
860                    "/proc/irq".to_string(),
861                    "/proc/sys".to_string(),
862                ],
863                devices: vec![
864                    OciDevice {
865                        device_type: "c".to_string(),
866                        path: "/dev/null".to_string(),
867                        major: Some(1),
868                        minor: Some(3),
869                        file_mode: Some(0o666),
870                        uid: Some(0),
871                        gid: Some(0),
872                    },
873                    OciDevice {
874                        device_type: "c".to_string(),
875                        path: "/dev/zero".to_string(),
876                        major: Some(1),
877                        minor: Some(5),
878                        file_mode: Some(0o666),
879                        uid: Some(0),
880                        gid: Some(0),
881                    },
882                    OciDevice {
883                        device_type: "c".to_string(),
884                        path: "/dev/full".to_string(),
885                        major: Some(1),
886                        minor: Some(7),
887                        file_mode: Some(0o666),
888                        uid: Some(0),
889                        gid: Some(0),
890                    },
891                    OciDevice {
892                        device_type: "c".to_string(),
893                        path: "/dev/random".to_string(),
894                        major: Some(1),
895                        minor: Some(8),
896                        file_mode: Some(0o666),
897                        uid: Some(0),
898                        gid: Some(0),
899                    },
900                    OciDevice {
901                        device_type: "c".to_string(),
902                        path: "/dev/urandom".to_string(),
903                        major: Some(1),
904                        minor: Some(9),
905                        file_mode: Some(0o666),
906                        uid: Some(0),
907                        gid: Some(0),
908                    },
909                ],
910                seccomp: None,
911                rootfs_propagation: Some("rprivate".to_string()),
912                sysctl: HashMap::new(),
913                cgroups_path: None,
914                intel_rdt: None,
915            }),
916        }
917    }
918
919    /// Add resource limits to the config
920    pub fn with_resources(mut self, limits: &ResourceLimits) -> Self {
921        let mut resources = OciResources {
922            memory: None,
923            cpu: None,
924            pids: None,
925        };
926
927        if let Some(memory_bytes) = limits.memory_bytes {
928            resources.memory = Some(OciMemory {
929                limit: Some(memory_bytes as i64),
930            });
931        }
932
933        if let Some(quota_us) = limits.cpu_quota_us {
934            resources.cpu = Some(OciCpu {
935                quota: Some(quota_us as i64),
936                period: Some(limits.cpu_period_us),
937            });
938        }
939
940        if let Some(pids_max) = limits.pids_max {
941            resources.pids = Some(OciPids {
942                limit: pids_max as i64,
943            });
944        }
945
946        if let Some(linux) = &mut self.linux {
947            linux.resources = Some(resources);
948        }
949
950        self
951    }
952
953    /// Configure the OCI noNewPrivileges process flag.
954    pub fn with_no_new_privileges(mut self, enabled: bool) -> Self {
955        self.process.no_new_privileges = enabled;
956        self
957    }
958
959    /// Add environment variables to the OCI process config.
960    pub fn with_env(mut self, vars: &[(String, String)]) -> Self {
961        for (key, value) in vars {
962            self.process.env.push(format!("{}={}", key, value));
963        }
964        self
965    }
966
967    /// Add sd_notify socket passthrough.
968    pub fn with_sd_notify(mut self) -> Self {
969        if let Ok(notify_socket) = std::env::var("NOTIFY_SOCKET") {
970            self.process
971                .env
972                .push(format!("NOTIFY_SOCKET={}", notify_socket));
973        }
974        self
975    }
976
977    /// Add bind mounts for secrets.
978    pub fn with_secret_mounts(mut self, secrets: &[crate::container::SecretMount]) -> Self {
979        for secret in secrets {
980            self.mounts.push(OciMount {
981                destination: secret.dest.to_string_lossy().to_string(),
982                source: secret.source.to_string_lossy().to_string(),
983                mount_type: "bind".to_string(),
984                options: vec![
985                    "bind".to_string(),
986                    "ro".to_string(),
987                    "nosuid".to_string(),
988                    "nodev".to_string(),
989                    "noexec".to_string(),
990                ],
991            });
992        }
993        self
994    }
995
996    /// Set the process identity for the OCI workload.
997    pub fn with_process_identity(mut self, identity: &crate::container::ProcessIdentity) -> Self {
998        self.process.user.uid = identity.uid;
999        self.process.user.gid = identity.gid;
1000        self.process.user.additional_gids = if identity.additional_gids.is_empty() {
1001            None
1002        } else {
1003            Some(identity.additional_gids.clone())
1004        };
1005        self
1006    }
1007
1008    /// Add a read-only bind mount of an in-memory secret staging directory at
1009    /// `/run/secrets`, plus compatibility bind mounts for each staged secret to
1010    /// its requested container destination.
1011    pub fn with_inmemory_secret_mounts(
1012        mut self,
1013        stage_dir: &Path,
1014        secrets: &[crate::container::SecretMount],
1015    ) -> Result<Self> {
1016        self.mounts.push(OciMount {
1017            destination: "/run/secrets".to_string(),
1018            source: stage_dir.to_string_lossy().to_string(),
1019            mount_type: "bind".to_string(),
1020            options: vec![
1021                "bind".to_string(),
1022                "ro".to_string(),
1023                "nosuid".to_string(),
1024                "nodev".to_string(),
1025                "noexec".to_string(),
1026            ],
1027        });
1028
1029        for secret in secrets {
1030            let dest = normalize_container_destination(&secret.dest)?;
1031            if !secret.source.starts_with(stage_dir) {
1032                return Err(NucleusError::ConfigError(format!(
1033                    "Staged secret source {:?} must live under {:?}",
1034                    secret.source, stage_dir
1035                )));
1036            }
1037            self.mounts.push(OciMount {
1038                destination: dest.to_string_lossy().to_string(),
1039                source: secret.source.to_string_lossy().to_string(),
1040                mount_type: "bind".to_string(),
1041                options: vec![
1042                    "bind".to_string(),
1043                    "ro".to_string(),
1044                    "nosuid".to_string(),
1045                    "nodev".to_string(),
1046                    "noexec".to_string(),
1047                ],
1048            });
1049        }
1050
1051        Ok(self)
1052    }
1053
1054    /// Add bind or tmpfs volume mounts.
1055    pub fn with_volume_mounts(mut self, volumes: &[crate::container::VolumeMount]) -> Result<Self> {
1056        use crate::container::VolumeSource;
1057
1058        for volume in volumes {
1059            let dest = normalize_volume_destination(&volume.dest)?;
1060            match &volume.source {
1061                VolumeSource::Bind { source } => {
1062                    crate::filesystem::validate_bind_mount_source(source)?;
1063                    let mut options = vec![
1064                        "bind".to_string(),
1065                        "nosuid".to_string(),
1066                        "nodev".to_string(),
1067                    ];
1068                    if volume.read_only {
1069                        options.push("ro".to_string());
1070                    }
1071                    self.mounts.push(OciMount {
1072                        destination: dest.to_string_lossy().to_string(),
1073                        source: source.to_string_lossy().to_string(),
1074                        mount_type: "bind".to_string(),
1075                        options,
1076                    });
1077                }
1078                VolumeSource::Tmpfs { size } => {
1079                    let mut options = vec![
1080                        "nosuid".to_string(),
1081                        "nodev".to_string(),
1082                        "mode=0755".to_string(),
1083                    ];
1084                    if volume.read_only {
1085                        options.push("ro".to_string());
1086                    }
1087                    if let Some(size) = size {
1088                        options.push(format!("size={}", size));
1089                    }
1090                    self.mounts.push(OciMount {
1091                        destination: dest.to_string_lossy().to_string(),
1092                        source: "tmpfs".to_string(),
1093                        mount_type: "tmpfs".to_string(),
1094                        options,
1095                    });
1096                }
1097            }
1098        }
1099
1100        Ok(self)
1101    }
1102
1103    /// Bind mount the host context directory into the container.
1104    ///
1105    /// The gVisor integration path expects `/context` to be writable so test
1106    /// workloads can write results back to the host.
1107    pub fn with_context_bind(mut self, context_dir: &std::path::Path) -> Self {
1108        self.mounts.push(OciMount {
1109            destination: "/context".to_string(),
1110            source: context_dir.to_string_lossy().to_string(),
1111            mount_type: "bind".to_string(),
1112            options: vec![
1113                "bind".to_string(),
1114                "ro".to_string(),
1115                "nosuid".to_string(),
1116                "nodev".to_string(),
1117            ],
1118        });
1119        self
1120    }
1121
1122    /// Add rootfs bind mounts from a pre-built rootfs path.
1123    pub fn with_rootfs_binds(mut self, rootfs_path: &std::path::Path) -> Self {
1124        let subdirs = ["bin", "sbin", "lib", "lib64", "usr", "etc", "nix"];
1125        for subdir in &subdirs {
1126            let source = rootfs_path.join(subdir);
1127            if source.exists() {
1128                self.mounts.push(OciMount {
1129                    destination: format!("/{}", subdir),
1130                    source: source.to_string_lossy().to_string(),
1131                    mount_type: "bind".to_string(),
1132                    options: vec![
1133                        "bind".to_string(),
1134                        "ro".to_string(),
1135                        "nosuid".to_string(),
1136                        "nodev".to_string(),
1137                    ],
1138                });
1139            }
1140        }
1141        self
1142    }
1143
1144    /// Replace the default namespace list with an explicit configuration.
1145    pub fn with_namespace_config(mut self, config: &NamespaceConfig) -> Self {
1146        let mut namespaces = Vec::new();
1147
1148        if config.pid {
1149            namespaces.push(OciNamespace {
1150                namespace_type: "pid".to_string(),
1151            });
1152        }
1153        if config.net {
1154            namespaces.push(OciNamespace {
1155                namespace_type: "network".to_string(),
1156            });
1157        }
1158        if config.ipc {
1159            namespaces.push(OciNamespace {
1160                namespace_type: "ipc".to_string(),
1161            });
1162        }
1163        if config.uts {
1164            namespaces.push(OciNamespace {
1165                namespace_type: "uts".to_string(),
1166            });
1167        }
1168        if config.mnt {
1169            namespaces.push(OciNamespace {
1170                namespace_type: "mount".to_string(),
1171            });
1172        }
1173        if config.cgroup {
1174            namespaces.push(OciNamespace {
1175                namespace_type: "cgroup".to_string(),
1176            });
1177        }
1178        if config.time {
1179            namespaces.push(OciNamespace {
1180                namespace_type: "time".to_string(),
1181            });
1182        }
1183        if config.user {
1184            namespaces.push(OciNamespace {
1185                namespace_type: "user".to_string(),
1186            });
1187        }
1188
1189        if let Some(linux) = &mut self.linux {
1190            linux.namespaces = Some(namespaces);
1191        }
1192
1193        self
1194    }
1195
1196    /// Add read-only bind mounts for host runtime paths.
1197    ///
1198    /// This mirrors the native fallback path for non-production containers so
1199    /// common executables such as `/bin/sh` remain available inside the OCI
1200    /// rootfs when no explicit rootfs is configured.
1201    pub fn with_host_runtime_binds(mut self) -> Self {
1202        // Use a fixed set of standard FHS paths only. Do NOT scan host $PATH,
1203        // which would expose arbitrary host directories inside the container.
1204        let host_paths: BTreeSet<String> =
1205            ["/bin", "/sbin", "/usr", "/lib", "/lib64", "/nix/store"]
1206                .iter()
1207                .map(|s| s.to_string())
1208                .collect();
1209
1210        for host_path in host_paths {
1211            let source = Path::new(&host_path);
1212            if !source.exists() {
1213                continue;
1214            }
1215
1216            self.mounts.push(OciMount {
1217                destination: host_path.clone(),
1218                source: source.to_string_lossy().to_string(),
1219                mount_type: "bind".to_string(),
1220                options: vec![
1221                    "bind".to_string(),
1222                    "ro".to_string(),
1223                    "nosuid".to_string(),
1224                    "nodev".to_string(),
1225                ],
1226            });
1227        }
1228        self
1229    }
1230
1231    /// Add user namespace configuration
1232    pub fn with_user_namespace(mut self) -> Self {
1233        if let Some(linux) = &mut self.linux {
1234            if let Some(namespaces) = &mut linux.namespaces {
1235                namespaces.push(OciNamespace {
1236                    namespace_type: "user".to_string(),
1237                });
1238            }
1239        }
1240        self
1241    }
1242
1243    /// Remove the OCI network namespace entry so runsc inherits the process
1244    /// network namespace that Nucleus prepared before exec.
1245    pub fn without_network_namespace(mut self) -> Self {
1246        if let Some(linux) = &mut self.linux {
1247            if let Some(namespaces) = &mut linux.namespaces {
1248                namespaces.retain(|ns| ns.namespace_type != "network");
1249            }
1250        }
1251
1252        self
1253    }
1254
1255    /// Configure gVisor's true rootless OCI path.
1256    ///
1257    /// gVisor expects UID/GID mappings in the OCI spec for this mode, and its
1258    /// rootless OCI implementation does not currently support a network
1259    /// namespace entry in the spec. We still control networking through
1260    /// runsc's top-level `--network` flag.
1261    pub fn with_rootless_user_namespace(mut self, config: &UserNamespaceConfig) -> Self {
1262        if let Some(linux) = &mut self.linux {
1263            if let Some(namespaces) = &mut linux.namespaces {
1264                namespaces.retain(|ns| ns.namespace_type != "network");
1265                if !namespaces.iter().any(|ns| ns.namespace_type == "user") {
1266                    namespaces.push(OciNamespace {
1267                        namespace_type: "user".to_string(),
1268                    });
1269                }
1270            }
1271            linux.uid_mappings = config.uid_mappings.iter().map(OciIdMapping::from).collect();
1272            linux.gid_mappings = config.gid_mappings.iter().map(OciIdMapping::from).collect();
1273        }
1274        self
1275    }
1276
1277    /// Set OCI lifecycle hooks on the config.
1278    pub fn with_hooks(mut self, hooks: OciHooks) -> Self {
1279        if hooks.is_empty() {
1280            self.hooks = None;
1281        } else {
1282            self.hooks = Some(hooks);
1283        }
1284        self
1285    }
1286
1287    /// Set process rlimits from the Nucleus runtime defaults and configured limits.
1288    ///
1289    /// Mirrors the RLIMIT backstops applied in-process for native containers
1290    /// (runtime.rs), expressed as OCI config so gVisor can enforce them.
1291    pub fn with_rlimits(mut self, limits: &ResourceLimits) -> Self {
1292        let mut rlimits = Vec::with_capacity(3);
1293
1294        if let Some(nproc_limit) = limits.pids_max {
1295            rlimits.push(OciRlimit {
1296                limit_type: "RLIMIT_NPROC".to_string(),
1297                hard: nproc_limit,
1298                soft: nproc_limit,
1299            });
1300        }
1301
1302        rlimits.push(OciRlimit {
1303            limit_type: "RLIMIT_NOFILE".to_string(),
1304            hard: 1024,
1305            soft: 1024,
1306        });
1307
1308        let memlock_limit = limits.memlock_bytes.unwrap_or(64 * 1024);
1309        rlimits.push(OciRlimit {
1310            limit_type: "RLIMIT_MEMLOCK".to_string(),
1311            hard: memlock_limit,
1312            soft: memlock_limit,
1313        });
1314
1315        self.process.rlimits = rlimits;
1316        self
1317    }
1318
1319    /// Set the linux.seccomp section from an OCI seccomp config.
1320    pub fn with_seccomp(mut self, seccomp: OciSeccomp) -> Self {
1321        if let Some(linux) = &mut self.linux {
1322            linux.seccomp = Some(seccomp);
1323        }
1324        self
1325    }
1326
1327    /// Set the linux.cgroupsPath field.
1328    pub fn with_cgroups_path(mut self, path: String) -> Self {
1329        if let Some(linux) = &mut self.linux {
1330            linux.cgroups_path = Some(path);
1331        }
1332        self
1333    }
1334
1335    /// Set sysctl key-value pairs on the linux config.
1336    pub fn with_sysctl(mut self, sysctl: HashMap<String, String>) -> Self {
1337        if let Some(linux) = &mut self.linux {
1338            linux.sysctl = sysctl;
1339        }
1340        self
1341    }
1342
1343    /// Set annotations on the OCI config.
1344    pub fn with_annotations(mut self, annotations: HashMap<String, String>) -> Self {
1345        self.annotations = annotations;
1346        self
1347    }
1348}
1349
1350impl From<&IdMapping> for OciIdMapping {
1351    fn from(mapping: &IdMapping) -> Self {
1352        Self {
1353            container_id: mapping.container_id,
1354            host_id: mapping.host_id,
1355            size: mapping.count,
1356        }
1357    }
1358}
1359
/// OCI Bundle manager
///
/// Creates and manages OCI-compliant bundles for gVisor. A bundle is a
/// directory on the host that holds the serialized `config.json` and a
/// `rootfs` subdirectory.
pub struct OciBundle {
    /// Host directory that contains `config.json` and `rootfs/`.
    bundle_path: PathBuf,
    /// Runtime configuration that is serialized to `config.json` on creation.
    config: OciConfig,
}
1367
/// Validate `name` as a single path component and convert it to a C string.
///
/// Rejects the empty string, `.`, `..`, anything containing `/`, and
/// anything containing a NUL byte, so the result can be handed to the `*at`
/// syscalls without escaping the parent directory.
///
/// # Errors
///
/// Returns `ErrorKind::InvalidInput` for any rejected name.
fn safe_child_name(name: &str) -> std::io::Result<CString> {
    use std::io::{Error, ErrorKind};

    let is_single_component = !matches!(name, "" | "." | "..") && !name.contains('/');
    if !is_single_component {
        return Err(Error::new(
            ErrorKind::InvalidInput,
            "invalid path child name",
        ));
    }

    match CString::new(name) {
        Ok(c_name) => Ok(c_name),
        Err(_) => Err(Error::new(
            ErrorKind::InvalidInput,
            "path child name contains NUL",
        )),
    }
}
1383
1384fn open_dir_nofollow(path: &Path) -> std::io::Result<fs::File> {
1385    OpenOptions::new()
1386        .read(true)
1387        .custom_flags(libc::O_DIRECTORY | libc::O_NOFOLLOW | libc::O_CLOEXEC)
1388        .open(path)
1389}
1390
1391fn mkdirat_dir(parent: &fs::File, name: &str, mode: libc::mode_t) -> std::io::Result<()> {
1392    let name = safe_child_name(name)?;
1393    let result = unsafe { libc::mkdirat(parent.as_raw_fd(), name.as_ptr(), mode) };
1394
1395    if result == 0 {
1396        return Ok(());
1397    }
1398
1399    let err = std::io::Error::last_os_error();
1400    if err.raw_os_error() == Some(libc::EEXIST) {
1401        Ok(())
1402    } else {
1403        Err(err)
1404    }
1405}
1406
1407fn openat_dir_nofollow(parent: &fs::File, name: &str) -> std::io::Result<fs::File> {
1408    let name = safe_child_name(name)?;
1409    let fd = unsafe {
1410        libc::openat(
1411            parent.as_raw_fd(),
1412            name.as_ptr(),
1413            libc::O_RDONLY | libc::O_DIRECTORY | libc::O_NOFOLLOW | libc::O_CLOEXEC,
1414        )
1415    };
1416
1417    if fd < 0 {
1418        Err(std::io::Error::last_os_error())
1419    } else {
1420        Ok(unsafe { fs::File::from_raw_fd(fd) })
1421    }
1422}
1423
1424fn openat_file_nofollow(
1425    parent: &fs::File,
1426    name: &str,
1427    mode: libc::mode_t,
1428) -> std::io::Result<fs::File> {
1429    let name = safe_child_name(name)?;
1430    let fd = unsafe {
1431        libc::openat(
1432            parent.as_raw_fd(),
1433            name.as_ptr(),
1434            libc::O_WRONLY | libc::O_CREAT | libc::O_TRUNC | libc::O_NOFOLLOW | libc::O_CLOEXEC,
1435            mode,
1436        )
1437    };
1438
1439    if fd < 0 {
1440        Err(std::io::Error::last_os_error())
1441    } else {
1442        Ok(unsafe { fs::File::from_raw_fd(fd) })
1443    }
1444}
1445
1446impl OciBundle {
1447    /// Create a new OCI bundle
1448    pub fn new(bundle_path: PathBuf, config: OciConfig) -> Self {
1449        Self {
1450            bundle_path,
1451            config,
1452        }
1453    }
1454
1455    /// Create the bundle directory structure and write config.json
1456    pub fn create(&self) -> Result<()> {
1457        info!("Creating OCI bundle at {:?}", self.bundle_path);
1458
1459        // Create the bundle directory, then reopen it without following a final
1460        // symlink before applying permissions or creating children beneath it.
1461        fs::create_dir_all(&self.bundle_path).map_err(|e| {
1462            NucleusError::GVisorError(format!(
1463                "Failed to create bundle directory {:?}: {}",
1464                self.bundle_path, e
1465            ))
1466        })?;
1467        let bundle_dir = open_dir_nofollow(&self.bundle_path).map_err(|e| {
1468            NucleusError::GVisorError(format!(
1469                "Failed to open bundle directory safely {:?}: {}",
1470                self.bundle_path, e
1471            ))
1472        })?;
1473        bundle_dir
1474            .set_permissions(fs::Permissions::from_mode(0o700))
1475            .map_err(|e| {
1476                NucleusError::GVisorError(format!(
1477                    "Failed to secure bundle directory permissions {:?}: {}",
1478                    self.bundle_path, e
1479                ))
1480            })?;
1481
1482        // Create rootfs relative to the trusted bundle directory. mkdirat plus
1483        // openat(O_NOFOLLOW) prevents a pre-existing rootfs symlink from being
1484        // chmodded when rootfs needs to be traversable for non-root UIDs.
1485        let rootfs = self.bundle_path.join("rootfs");
1486        mkdirat_dir(&bundle_dir, "rootfs", 0o755).map_err(|e| {
1487            NucleusError::GVisorError(format!("Failed to create rootfs directory: {}", e))
1488        })?;
1489        let rootfs_dir = openat_dir_nofollow(&bundle_dir, "rootfs").map_err(|e| {
1490            NucleusError::GVisorError(format!(
1491                "Failed to open rootfs directory safely {:?}: {}",
1492                rootfs, e
1493            ))
1494        })?;
1495        // The rootfs is the container's "/" – it must be traversable by the
1496        // container UID which may be non-root (via --user).  Mode 0755 matches
1497        // the standard Linux root directory permission and lets gVisor's VFS
1498        // permit path traversal for any UID.
1499        rootfs_dir
1500            .set_permissions(fs::Permissions::from_mode(0o755))
1501            .map_err(|e| {
1502                NucleusError::GVisorError(format!(
1503                    "Failed to set rootfs directory permissions {:?}: {}",
1504                    rootfs, e
1505                ))
1506            })?;
1507
1508        // Write config.json
1509        let config_path = self.bundle_path.join("config.json");
1510        let config_json = serde_json::to_string_pretty(&self.config).map_err(|e| {
1511            NucleusError::GVisorError(format!("Failed to serialize OCI config: {}", e))
1512        })?;
1513
1514        let mut file = openat_file_nofollow(&bundle_dir, "config.json", 0o600).map_err(|e| {
1515            NucleusError::GVisorError(format!(
1516                "Failed to open config.json safely {:?}: {}",
1517                config_path, e
1518            ))
1519        })?;
1520        file.set_permissions(fs::Permissions::from_mode(0o600))
1521            .map_err(|e| {
1522                NucleusError::GVisorError(format!(
1523                    "Failed to set config.json permissions {:?}: {}",
1524                    config_path, e
1525                ))
1526            })?;
1527        file.write_all(config_json.as_bytes()).map_err(|e| {
1528            NucleusError::GVisorError(format!("Failed to write config.json: {}", e))
1529        })?;
1530        file.sync_all()
1531            .map_err(|e| NucleusError::GVisorError(format!("Failed to sync config.json: {}", e)))?;
1532
1533        debug!("Created OCI bundle structure at {:?}", self.bundle_path);
1534
1535        Ok(())
1536    }
1537
1538    /// Get the rootfs path
1539    pub fn rootfs_path(&self) -> PathBuf {
1540        self.bundle_path.join("rootfs")
1541    }
1542
1543    /// Get the bundle path
1544    pub fn bundle_path(&self) -> &Path {
1545        &self.bundle_path
1546    }
1547
1548    /// Clean up the bundle
1549    pub fn cleanup(&self) -> Result<()> {
1550        if self.bundle_path.exists() {
1551            fs::remove_dir_all(&self.bundle_path).map_err(|e| {
1552                NucleusError::GVisorError(format!("Failed to cleanup bundle: {}", e))
1553            })?;
1554            debug!("Cleaned up OCI bundle at {:?}", self.bundle_path);
1555        }
1556        Ok(())
1557    }
1558}
1559
1560#[cfg(test)]
1561mod tests {
1562    use super::*;
1563    use std::os::unix::fs::symlink;
1564    use tempfile::TempDir;
1565
1566    #[test]
1567    fn test_oci_config_new() {
1568        let config = OciConfig::new(vec!["/bin/sh".to_string()], Some("test".to_string()));
1569
1570        assert_eq!(config.oci_version, "1.0.2");
1571        assert_eq!(config.root.path, "rootfs");
1572        assert_eq!(config.process.args, vec!["/bin/sh"]);
1573        assert_eq!(config.hostname, Some("test".to_string()));
1574    }
1575
1576    #[test]
1577    fn test_oci_config_with_resources() {
1578        let limits = ResourceLimits::unlimited()
1579            .with_memory("512M")
1580            .unwrap()
1581            .with_cpu_cores(2.0)
1582            .unwrap();
1583
1584        let config = OciConfig::new(vec!["/bin/sh".to_string()], None).with_resources(&limits);
1585
1586        assert!(config.linux.is_some());
1587        let linux = config.linux.unwrap();
1588        assert!(linux.resources.is_some());
1589
1590        let resources = linux.resources.unwrap();
1591        assert!(resources.memory.is_some());
1592        assert!(resources.cpu.is_some());
1593    }
1594
1595    #[test]
1596    fn test_oci_bundle_create() {
1597        let temp_dir = TempDir::new().unwrap();
1598        let bundle_path = temp_dir.path().join("test-bundle");
1599
1600        let config = OciConfig::new(vec!["/bin/sh".to_string()], None);
1601        let bundle = OciBundle::new(bundle_path.clone(), config);
1602
1603        bundle.create().unwrap();
1604
1605        assert!(bundle_path.exists());
1606        assert!(bundle_path.join("rootfs").exists());
1607        assert!(bundle_path.join("config.json").exists());
1608
1609        bundle.cleanup().unwrap();
1610        assert!(!bundle_path.exists());
1611    }
1612
1613    #[test]
1614    fn test_oci_bundle_rejects_bundle_symlink() {
1615        let temp_dir = TempDir::new().unwrap();
1616        let bundle_path = temp_dir.path().join("test-bundle");
1617        let protected_host_dir = temp_dir.path().join("protected-host-dir");
1618
1619        fs::create_dir_all(&protected_host_dir).unwrap();
1620        fs::set_permissions(&protected_host_dir, fs::Permissions::from_mode(0o755)).unwrap();
1621        symlink(&protected_host_dir, &bundle_path).unwrap();
1622
1623        let config = OciConfig::new(vec!["/bin/sh".to_string()], None);
1624        let bundle = OciBundle::new(bundle_path.clone(), config);
1625
1626        let err = bundle.create().unwrap_err();
1627
1628        assert!(format!("{err}").contains("Failed to open bundle directory safely"));
1629        assert_eq!(
1630            fs::metadata(&protected_host_dir)
1631                .unwrap()
1632                .permissions()
1633                .mode()
1634                & 0o777,
1635            0o755
1636        );
1637        assert!(fs::symlink_metadata(&bundle_path)
1638            .unwrap()
1639            .file_type()
1640            .is_symlink());
1641    }
1642
1643    #[test]
1644    fn test_oci_bundle_rejects_rootfs_symlink() {
1645        let temp_dir = TempDir::new().unwrap();
1646        let bundle_path = temp_dir.path().join("test-bundle");
1647        let protected_host_dir = temp_dir.path().join("protected-host-dir");
1648
1649        fs::create_dir_all(&bundle_path).unwrap();
1650        fs::create_dir_all(&protected_host_dir).unwrap();
1651        fs::set_permissions(&protected_host_dir, fs::Permissions::from_mode(0o700)).unwrap();
1652        symlink(&protected_host_dir, bundle_path.join("rootfs")).unwrap();
1653
1654        let config = OciConfig::new(vec!["/bin/sh".to_string()], None);
1655        let bundle = OciBundle::new(bundle_path.clone(), config);
1656
1657        let err = bundle.create().unwrap_err();
1658
1659        assert!(format!("{err}").contains("Failed to open rootfs directory safely"));
1660        assert_eq!(
1661            fs::metadata(&protected_host_dir)
1662                .unwrap()
1663                .permissions()
1664                .mode()
1665                & 0o777,
1666            0o700
1667        );
1668        assert!(fs::symlink_metadata(bundle_path.join("rootfs"))
1669            .unwrap()
1670            .file_type()
1671            .is_symlink());
1672    }
1673
1674    #[test]
1675    fn test_oci_config_serialization() {
1676        let config = OciConfig::new(vec!["/bin/sh".to_string()], Some("test".to_string()));
1677
1678        let json = serde_json::to_string_pretty(&config).unwrap();
1679        assert!(json.contains("ociVersion"));
1680        assert!(json.contains("1.0.2"));
1681        assert!(json.contains("/bin/sh"));
1682
1683        // Test deserialization
1684        let deserialized: OciConfig = serde_json::from_str(&json).unwrap();
1685        assert_eq!(deserialized.oci_version, config.oci_version);
1686        assert_eq!(deserialized.process.args, config.process.args);
1687    }
1688
1689    #[test]
1690    fn test_host_runtime_binds_uses_fixed_paths_not_host_path() {
1691        // with_host_runtime_binds must NOT scan the host $PATH. Only standard
1692        // FHS paths should be bind-mounted to prevent leaking arbitrary host
1693        // directories into the container. Verify by setting a distinctive PATH
1694        // and checking that none of its entries appear in the resulting mounts.
1695        std::env::set_var("PATH", "/tmp/evil-inject-path/bin:/opt/attacker/sbin");
1696        let config = OciConfig::new(vec!["/bin/sh".to_string()], None).with_host_runtime_binds();
1697        let mount_dests: Vec<&str> = config
1698            .mounts
1699            .iter()
1700            .map(|m| m.destination.as_str())
1701            .collect();
1702        let mount_srcs: Vec<&str> = config.mounts.iter().map(|m| m.source.as_str()).collect();
1703        // Verify no mount references the injected PATH entries
1704        for path in &["/tmp/evil-inject-path", "/opt/attacker"] {
1705            assert!(
1706                !mount_dests.iter().any(|d| d.contains(path)),
1707                "with_host_runtime_binds must not use host $PATH – found {:?} in mount destinations",
1708                path
1709            );
1710            assert!(
1711                !mount_srcs.iter().any(|s| s.contains(path)),
1712                "with_host_runtime_binds must not use host $PATH – found {:?} in mount sources",
1713                path
1714            );
1715        }
1716        // Verify only standard FHS paths are mounted
1717        let allowed_prefixes = ["/bin", "/sbin", "/usr", "/lib", "/lib64", "/nix/store"];
1718        for mount in &config.mounts {
1719            if mount.mount_type == "bind" {
1720                assert!(
1721                    allowed_prefixes
1722                        .iter()
1723                        .any(|p| mount.destination.starts_with(p)),
1724                    "unexpected bind mount destination: {} – only FHS paths allowed",
1725                    mount.destination
1726                );
1727            }
1728        }
1729    }
1730
1731    #[test]
1732    fn test_volume_mounts_include_bind_and_tmpfs_options() {
1733        let tmp = tempfile::TempDir::new().unwrap();
1734        let config = OciConfig::new(vec!["/bin/sh".to_string()], None)
1735            .with_volume_mounts(&[
1736                crate::container::VolumeMount {
1737                    source: crate::container::VolumeSource::Bind {
1738                        source: tmp.path().to_path_buf(),
1739                    },
1740                    dest: std::path::PathBuf::from("/var/lib/app"),
1741                    read_only: true,
1742                },
1743                crate::container::VolumeMount {
1744                    source: crate::container::VolumeSource::Tmpfs {
1745                        size: Some("64M".to_string()),
1746                    },
1747                    dest: std::path::PathBuf::from("/var/cache/app"),
1748                    read_only: false,
1749                },
1750            ])
1751            .unwrap();
1752
1753        assert!(config.mounts.iter().any(|mount| {
1754            mount.destination == "/var/lib/app"
1755                && mount.mount_type == "bind"
1756                && mount.options.contains(&"ro".to_string())
1757        }));
1758        assert!(config.mounts.iter().any(|mount| {
1759            mount.destination == "/var/cache/app"
1760                && mount.mount_type == "tmpfs"
1761                && mount.options.contains(&"size=64M".to_string())
1762        }));
1763    }
1764
1765    #[test]
1766    fn test_volume_mounts_reject_sensitive_host_sources() {
1767        let err = OciConfig::new(vec!["/bin/sh".to_string()], None)
1768            .with_volume_mounts(&[crate::container::VolumeMount {
1769                source: crate::container::VolumeSource::Bind {
1770                    source: std::path::PathBuf::from("/proc/sys"),
1771                },
1772                dest: std::path::PathBuf::from("/host-proc"),
1773                read_only: true,
1774            }])
1775            .unwrap_err();
1776
1777        assert!(err.to_string().contains("sensitive host path"));
1778    }
1779
1780    #[test]
1781    fn test_volume_mounts_reject_reserved_destinations() {
1782        let tmp = tempfile::TempDir::new().unwrap();
1783        let err = OciConfig::new(vec!["/bin/sh".to_string()], None)
1784            .with_volume_mounts(&[crate::container::VolumeMount {
1785                source: crate::container::VolumeSource::Bind {
1786                    source: tmp.path().to_path_buf(),
1787                },
1788                dest: std::path::PathBuf::from("/usr/bin"),
1789                read_only: true,
1790            }])
1791            .unwrap_err();
1792
1793        assert!(err.to_string().contains("reserved"));
1794    }
1795
1796    #[test]
1797    fn test_oci_config_with_process_identity() {
1798        let config = OciConfig::new(vec!["/bin/sh".to_string()], None).with_process_identity(
1799            &crate::container::ProcessIdentity {
1800                uid: 1001,
1801                gid: 1002,
1802                additional_gids: vec![1003, 1004],
1803            },
1804        );
1805
1806        assert_eq!(config.process.user.uid, 1001);
1807        assert_eq!(config.process.user.gid, 1002);
1808        assert_eq!(config.process.user.additional_gids, Some(vec![1003, 1004]));
1809    }
1810
1811    #[test]
1812    fn test_oci_config_with_rlimits_uses_configured_memlock() {
1813        let limits = ResourceLimits::default()
1814            .with_pids(99)
1815            .unwrap()
1816            .with_memlock("8M")
1817            .unwrap();
1818
1819        let config = OciConfig::new(vec!["/bin/sh".to_string()], None).with_rlimits(&limits);
1820
1821        assert!(config.process.rlimits.iter().any(|limit| {
1822            limit.limit_type == "RLIMIT_NPROC" && limit.soft == 99 && limit.hard == 99
1823        }));
1824        assert!(config.process.rlimits.iter().any(|limit| {
1825            limit.limit_type == "RLIMIT_MEMLOCK"
1826                && limit.soft == 8 * 1024 * 1024
1827                && limit.hard == 8 * 1024 * 1024
1828        }));
1829    }
1830
1831    #[test]
1832    fn test_oci_config_with_rlimits_omits_nproc_when_unlimited() {
1833        let limits = ResourceLimits {
1834            pids_max: None,
1835            ..ResourceLimits::default()
1836        };
1837
1838        let config = OciConfig::new(vec!["/bin/sh".to_string()], None).with_rlimits(&limits);
1839
1840        assert!(
1841            !config
1842                .process
1843                .rlimits
1844                .iter()
1845                .any(|limit| limit.limit_type == "RLIMIT_NPROC"),
1846            "RLIMIT_NPROC must be omitted when pids_max is unlimited"
1847        );
1848    }
1849
1850    #[test]
1851    fn test_oci_config_uses_hardcoded_path_not_host() {
1852        // C-3: PATH must be a hardcoded minimal value, never the host's PATH.
1853        // This prevents leaking host filesystem layout into the container.
1854        std::env::set_var("PATH", "/nix/store/secret-hash/bin:/home/user/.local/bin");
1855        let config = OciConfig::new(vec!["/bin/sh".to_string()], None);
1856        let path_env = config
1857            .process
1858            .env
1859            .iter()
1860            .find(|e| e.starts_with("PATH="))
1861            .expect("PATH env must be set");
1862        assert_eq!(
1863            path_env, "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
1864            "OCI config must not leak host PATH"
1865        );
1866        assert!(
1867            !path_env.contains("/nix/store/secret"),
1868            "Host PATH must not leak into container"
1869        );
1870    }
1871
1872    #[test]
1873    fn test_oci_hooks_serialization_roundtrip() {
1874        let hooks = OciHooks {
1875            create_runtime: vec![OciHook {
1876                path: "/usr/bin/hook1".to_string(),
1877                args: vec!["hook1".to_string(), "--arg1".to_string()],
1878                env: vec!["FOO=bar".to_string()],
1879                timeout: Some(10),
1880            }],
1881            create_container: vec![],
1882            start_container: vec![],
1883            poststart: vec![OciHook {
1884                path: "/usr/bin/hook2".to_string(),
1885                args: vec![],
1886                env: vec![],
1887                timeout: None,
1888            }],
1889            poststop: vec![],
1890        };
1891
1892        let json = serde_json::to_string_pretty(&hooks).unwrap();
1893        assert!(json.contains("createRuntime"));
1894        assert!(json.contains("/usr/bin/hook1"));
1895        assert!(!json.contains("createContainer")); // empty vecs are skipped
1896
1897        let deserialized: OciHooks = serde_json::from_str(&json).unwrap();
1898        assert_eq!(deserialized.create_runtime.len(), 1);
1899        assert_eq!(deserialized.create_runtime[0].path, "/usr/bin/hook1");
1900        assert_eq!(deserialized.create_runtime[0].timeout, Some(10));
1901        assert_eq!(deserialized.poststart.len(), 1);
1902        assert!(deserialized.create_container.is_empty());
1903    }
1904
1905    #[test]
1906    fn test_oci_hooks_is_empty() {
1907        let empty = OciHooks::default();
1908        assert!(empty.is_empty());
1909
1910        let not_empty = OciHooks {
1911            poststop: vec![OciHook {
1912                path: "/bin/cleanup".to_string(),
1913                args: vec![],
1914                env: vec![],
1915                timeout: None,
1916            }],
1917            ..Default::default()
1918        };
1919        assert!(!not_empty.is_empty());
1920    }
1921
1922    #[test]
1923    fn test_oci_config_with_hooks() {
1924        let hooks = OciHooks {
1925            create_runtime: vec![OciHook {
1926                path: "/usr/bin/setup".to_string(),
1927                args: vec![],
1928                env: vec![],
1929                timeout: None,
1930            }],
1931            ..Default::default()
1932        };
1933
1934        let config = OciConfig::new(vec!["/bin/sh".to_string()], None).with_hooks(hooks);
1935        assert!(config.hooks.is_some());
1936
1937        let json = serde_json::to_string_pretty(&config).unwrap();
1938        assert!(json.contains("hooks"));
1939        assert!(json.contains("createRuntime"));
1940
1941        let deserialized: OciConfig = serde_json::from_str(&json).unwrap();
1942        assert!(deserialized.hooks.is_some());
1943        assert_eq!(deserialized.hooks.unwrap().create_runtime.len(), 1);
1944    }
1945
1946    #[test]
1947    fn test_oci_config_with_empty_hooks_serializes_without_hooks() {
1948        let config =
1949            OciConfig::new(vec!["/bin/sh".to_string()], None).with_hooks(OciHooks::default());
1950        assert!(config.hooks.is_none()); // empty hooks are set to None
1951
1952        let json = serde_json::to_string_pretty(&config).unwrap();
1953        assert!(!json.contains("hooks"));
1954    }
1955
1956    #[test]
1957    fn test_oci_hook_rejects_relative_path() {
1958        let hook = OciHook {
1959            path: "relative/path".to_string(),
1960            args: vec![],
1961            env: vec![],
1962            timeout: None,
1963        };
1964        let state = OciContainerState {
1965            oci_version: "1.0.2".to_string(),
1966            id: "test".to_string(),
1967            status: OciStatus::Creating,
1968            pid: 1234,
1969            bundle: "/tmp/bundle".to_string(),
1970        };
1971        let result = OciHooks::run_hooks(&[hook], &state, "test");
1972        assert!(result.is_err());
1973        let err_msg = result.unwrap_err().to_string();
1974        assert!(err_msg.contains("absolute"), "error: {}", err_msg);
1975    }
1976
1977    /// Read the original PATH from /proc/self/environ.
1978    ///
1979    /// Other tests in this module call `std::env::set_var("PATH", ...)` which
1980    /// corrupts the process environment. /proc/self/environ is frozen at
1981    /// process startup so it always reflects the real PATH.
1982    fn original_path() -> String {
1983        if let Ok(environ) = std::fs::read("/proc/self/environ") {
1984            for entry in environ.split(|&b| b == 0) {
1985                if let Ok(s) = std::str::from_utf8(entry) {
1986                    if let Some(val) = s.strip_prefix("PATH=") {
1987                        return val.to_string();
1988                    }
1989                }
1990            }
1991        }
1992        String::new()
1993    }
1994
1995    /// Resolve the absolute path to bash for test scripts.
1996    fn find_bash() -> String {
1997        let candidates = ["/bin/bash", "/usr/bin/bash"];
1998        for c in &candidates {
1999            if std::path::Path::new(c).exists() {
2000                return c.to_string();
2001            }
2002        }
2003        for dir in original_path().split(':') {
2004            let candidate = std::path::PathBuf::from(dir).join("bash");
2005            if candidate.exists() {
2006                return candidate.to_string_lossy().to_string();
2007            }
2008        }
2009        panic!("Cannot find bash binary for test");
2010    }
2011
2012    /// Write a script file with proper shebang and ensure it's fully flushed before execution.
2013    /// Embeds the original PATH so scripts can find utilities like `cat`/`touch`
2014    /// even when other tests have corrupted the process PATH.
2015    fn write_script(path: &std::path::Path, body: &str) {
2016        use std::io::Write as IoWrite;
2017        let bash = find_bash();
2018        let orig_path = original_path();
2019        let content = format!("#!{}\nexport PATH='{}'\n{}", bash, orig_path, body);
2020        let mut f = OpenOptions::new()
2021            .create(true)
2022            .truncate(true)
2023            .write(true)
2024            .mode(0o755)
2025            .open(path)
2026            .unwrap();
2027        f.write_all(content.as_bytes()).unwrap();
2028        f.sync_all().unwrap();
2029        drop(f);
2030    }
2031
2032    #[test]
2033    fn test_oci_hook_executes_successfully() {
2034        let temp_dir = TempDir::new().unwrap();
2035        let hook_script = temp_dir.path().join("hook.sh");
2036        let output_file = temp_dir.path().join("output.json");
2037
2038        write_script(
2039            &hook_script,
2040            &format!("cat > {}\n", output_file.to_string_lossy()),
2041        );
2042
2043        let hook = OciHook {
2044            path: hook_script.to_string_lossy().to_string(),
2045            args: vec![],
2046            env: vec![],
2047            timeout: Some(5),
2048        };
2049        let state = OciContainerState {
2050            oci_version: "1.0.2".to_string(),
2051            id: "test-container".to_string(),
2052            status: OciStatus::Creating,
2053            pid: 12345,
2054            bundle: "/tmp/test-bundle".to_string(),
2055        };
2056
2057        OciHooks::run_hooks(&[hook], &state, "createRuntime").unwrap();
2058
2059        // Verify the hook received the container state JSON on stdin
2060        let written = std::fs::read_to_string(&output_file).unwrap();
2061        let parsed: serde_json::Value = serde_json::from_str(&written).unwrap();
2062        assert_eq!(parsed["id"], "test-container");
2063        assert_eq!(parsed["pid"], 12345);
2064        assert_eq!(parsed["status"], "creating");
2065    }
2066
2067    #[test]
2068    fn test_oci_hook_retries_text_file_busy_spawn() {
2069        let temp_dir = TempDir::new().unwrap();
2070        let hook_script = temp_dir.path().join("hook.sh");
2071        let output_file = temp_dir.path().join("output.json");
2072
2073        write_script(
2074            &hook_script,
2075            &format!("cat > {}\n", output_file.to_string_lossy()),
2076        );
2077
2078        let (ready_tx, ready_rx) = std::sync::mpsc::channel();
2079        let busy_script = hook_script.clone();
2080        let busy_handle = std::thread::spawn(move || {
2081            let _busy_file = OpenOptions::new().write(true).open(&busy_script).unwrap();
2082            ready_tx.send(()).unwrap();
2083            std::thread::sleep(std::time::Duration::from_millis(100));
2084        });
2085        ready_rx.recv().unwrap();
2086
2087        let hook = OciHook {
2088            path: hook_script.to_string_lossy().to_string(),
2089            args: vec![],
2090            env: vec![],
2091            timeout: Some(5),
2092        };
2093        let state = OciContainerState {
2094            oci_version: "1.0.2".to_string(),
2095            id: "test-container".to_string(),
2096            status: OciStatus::Creating,
2097            pid: 12345,
2098            bundle: "/tmp/test-bundle".to_string(),
2099        };
2100
2101        let result = OciHooks::run_hooks(&[hook], &state, "createRuntime");
2102        busy_handle.join().unwrap();
2103        result.unwrap();
2104
2105        let written = std::fs::read_to_string(&output_file).unwrap();
2106        let parsed: serde_json::Value = serde_json::from_str(&written).unwrap();
2107        assert_eq!(parsed["id"], "test-container");
2108    }
2109
2110    #[test]
2111    fn test_oci_hook_nonzero_exit_is_error() {
2112        let temp_dir = TempDir::new().unwrap();
2113        let hook_script = temp_dir.path().join("fail.sh");
2114        write_script(&hook_script, "exit 1\n");
2115
2116        let hook = OciHook {
2117            path: hook_script.to_string_lossy().to_string(),
2118            args: vec![],
2119            env: vec![],
2120            timeout: Some(5),
2121        };
2122        let state = OciContainerState {
2123            oci_version: "1.0.2".to_string(),
2124            id: "test".to_string(),
2125            status: OciStatus::Creating,
2126            pid: 1,
2127            bundle: "".to_string(),
2128        };
2129
2130        let result = OciHooks::run_hooks(&[hook], &state, "test");
2131        assert!(result.is_err());
2132        assert!(result
2133            .unwrap_err()
2134            .to_string()
2135            .contains("exited with status"));
2136    }
2137
2138    #[test]
2139    fn test_oci_hooks_best_effort_continues_on_failure() {
2140        let temp_dir = TempDir::new().unwrap();
2141        let fail_script = temp_dir.path().join("fail.sh");
2142        write_script(&fail_script, "exit 1\n");
2143
2144        let marker = temp_dir.path().join("ran");
2145        let ok_script = temp_dir.path().join("ok.sh");
2146        write_script(&ok_script, &format!("touch {}\n", marker.to_string_lossy()));
2147
2148        let hooks = vec![
2149            OciHook {
2150                path: fail_script.to_string_lossy().to_string(),
2151                args: vec![],
2152                env: vec![],
2153                timeout: Some(5),
2154            },
2155            OciHook {
2156                path: ok_script.to_string_lossy().to_string(),
2157                args: vec![],
2158                env: vec![],
2159                timeout: Some(5),
2160            },
2161        ];
2162        let state = OciContainerState {
2163            oci_version: "1.0.2".to_string(),
2164            id: "test".to_string(),
2165            status: OciStatus::Stopped,
2166            pid: 0,
2167            bundle: "".to_string(),
2168        };
2169
2170        // best_effort should not panic or return error
2171        OciHooks::run_hooks_best_effort(&hooks, &state, "poststop");
2172        // Second hook should have run despite first failing
2173        assert!(marker.exists(), "second hook should run after first fails");
2174    }
2175}