Skip to main content

zlayer_agent/
bundle.rs

1//! OCI Bundle Creation
2//!
3//! Creates OCI-compliant bundles for container runtimes using libcontainer (youki).
4//! A bundle consists of a directory with:
5//! - config.json: OCI runtime specification
6//! - rootfs/: Container filesystem (symlink or bind mount target)
7
8use crate::cdi::{self, CdiContainerEdits, CdiRegistry};
9use crate::error::{AgentError, Result};
10use crate::runtime::ContainerId;
11use oci_spec::runtime::{
12    Capability, Hook, HookBuilder, Hooks, HooksBuilder, LinuxBuilder, LinuxCapabilitiesBuilder,
13    LinuxCpuBuilder, LinuxDeviceBuilder, LinuxDeviceCgroupBuilder, LinuxDeviceType,
14    LinuxMemoryBuilder, LinuxNamespaceBuilder, LinuxNamespaceType, LinuxResourcesBuilder, Mount,
15    MountBuilder, ProcessBuilder, RootBuilder, Spec, SpecBuilder, UserBuilder,
16};
17// `LinuxIdMappingBuilder` is only used by the unix-gated rootless user-namespace
18// helpers below; importing it unconditionally trips dead-code lints on Windows.
19#[cfg(unix)]
20use oci_spec::runtime::LinuxIdMappingBuilder;
21use std::collections::{HashMap, HashSet};
22// `MetadataExt` is only meaningful on Unix-like hosts where `/dev/*` nodes exist
23// and have major/minor numbers. On Windows this module is still built so that
24// `BundleBuilder::build_spec_only` (cross-platform OCI Spec generation) can be
25// called from the WSL2 delegate runtime, which then pipes the generated
26// `config.json` into a Linux WSL2 distro that owns the actual device
27// fingerprint. See G-1 / G-2 in the Windows plan. The import is performed
28// inside `get_device_major_minor` itself to avoid an unused-import warning on
29// non-Unix platforms.
30use std::path::{Path, PathBuf};
31use std::str::FromStr;
32use std::sync::Arc;
33use tokio::fs;
34use zlayer_secrets::SecretsProvider;
35use zlayer_spec::{GpuSharingMode, ServiceSpec, StorageSpec, StorageTier};
36
/// Default host directory for the NVIDIA MPS control pipe, used when the spec
/// doesn't override [`zlayer_spec::GpuSpec::mps_pipe_dir`].
const DEFAULT_MPS_PIPE_DIR: &str = "/tmp/nvidia-mps";

/// Default host directory for NVIDIA MPS log output, used when the spec
/// doesn't override [`zlayer_spec::GpuSpec::mps_log_dir`].
const DEFAULT_MPS_LOG_DIR: &str = "/tmp/nvidia-log";

/// Container path where a host-supplied NVIDIA time-slicing config YAML is
/// surfaced (read-only). The file is informational — `ZLayer` doesn't interpret
/// it; tools running inside the container can read it to discover slice
/// topology.
const TIMESLICE_CONFIG_CONTAINER_PATH: &str = "/etc/nvidia/gpu-time-slicing.yaml";
50
/// Resolved MPS host directories (pipe + log), validated to exist on disk.
///
/// Returned by [`resolve_mps_dirs`] only when `GpuSpec.sharing == Mps`. Both
/// paths passed an `is_dir()` check at the time the helper ran, so callers can
/// bind-mount them directly. The paths are taken verbatim from the spec (or
/// the built-in defaults) and are not canonicalized — a relative path in the
/// spec stays relative.
struct MpsDirs {
    /// Host directory holding the MPS control pipe.
    pipe_dir: PathBuf,
    /// Host directory receiving MPS log output.
    log_dir: PathBuf,
}
60
61/// Resolve and validate the MPS pipe / log directories for a GPU spec.
62///
63/// Returns `Ok(None)` when sharing is not MPS (or absent), `Ok(Some(...))`
64/// when both directories exist on the host, or
65/// [`AgentError::GpuSharingUnavailable`] when either directory is missing.
66///
67/// Defaults to [`DEFAULT_MPS_PIPE_DIR`] / [`DEFAULT_MPS_LOG_DIR`] when the
68/// spec omits explicit paths, matching the convention used by
69/// `nvidia-cuda-mps-control` out of the box.
70fn resolve_mps_dirs(gpu: &zlayer_spec::GpuSpec) -> Result<Option<MpsDirs>> {
71    if gpu.sharing != Some(GpuSharingMode::Mps) {
72        return Ok(None);
73    }
74
75    let pipe_dir = PathBuf::from(gpu.mps_pipe_dir.as_deref().unwrap_or(DEFAULT_MPS_PIPE_DIR));
76    let log_dir = PathBuf::from(gpu.mps_log_dir.as_deref().unwrap_or(DEFAULT_MPS_LOG_DIR));
77
78    if !pipe_dir.is_dir() {
79        return Err(AgentError::GpuSharingUnavailable {
80            mode: "mps".to_string(),
81            reason: format!(
82                "MPS pipe directory {} does not exist; ensure nvidia-cuda-mps-control is running",
83                pipe_dir.display()
84            ),
85        });
86    }
87    if !log_dir.is_dir() {
88        return Err(AgentError::GpuSharingUnavailable {
89            mode: "mps".to_string(),
90            reason: format!(
91                "MPS log directory {} does not exist; ensure nvidia-cuda-mps-control is running",
92                log_dir.display()
93            ),
94        });
95    }
96
97    Ok(Some(MpsDirs { pipe_dir, log_dir }))
98}
99
100/// Convert a CDI device node descriptor into the OCI [`LinuxDevice`] used by
101/// the runtime.
102///
103/// CDI device nodes may omit `type`, `major`, and `minor` — in that case we
104/// probe the host (via `get_device_type` / `get_device_major_minor`) using
105/// the resolved host path, falling back to character device with zero
106/// major/minor when the file is unavailable (typical for test fixtures
107/// that reference paths that don't exist on the build host).
108fn cdi_node_to_oci_device(
109    node: &crate::cdi::CdiDeviceNode,
110) -> Result<oci_spec::runtime::LinuxDevice> {
111    let host_path = node.host_path.as_deref().unwrap_or(&node.path);
112
113    let dev_type = match node.device_type.as_deref() {
114        Some("c" | "u") => LinuxDeviceType::C,
115        Some("b") => LinuxDeviceType::B,
116        Some("p") => LinuxDeviceType::P,
117        _ => get_device_type(host_path).unwrap_or(LinuxDeviceType::C),
118    };
119
120    let (major, minor) = if let (Some(maj), Some(min)) = (node.major, node.minor) {
121        (maj, min)
122    } else {
123        get_device_major_minor(host_path).unwrap_or((0, 0))
124    };
125
126    let mut builder = LinuxDeviceBuilder::default()
127        .path(node.path.clone())
128        .typ(dev_type)
129        .major(major)
130        .minor(minor);
131    if let Some(mode) = node.file_mode {
132        builder = builder.file_mode(mode);
133    } else {
134        builder = builder.file_mode(0o666u32);
135    }
136    builder = builder.uid(node.uid.unwrap_or(0));
137    builder = builder.gid(node.gid.unwrap_or(0));
138
139    builder.build().map_err(|e| {
140        AgentError::InvalidSpec(format!(
141            "failed to build CDI device {path}: {e}",
142            path = node.path
143        ))
144    })
145}
146
147/// Convert a CDI hook descriptor into the OCI [`Hook`] used by the runtime.
148fn convert_cdi_hook(cdi_hook: &crate::cdi::CdiHook) -> Result<Hook> {
149    let mut builder = HookBuilder::default().path(PathBuf::from(&cdi_hook.path));
150    if !cdi_hook.args.is_empty() {
151        builder = builder.args(cdi_hook.args.clone());
152    }
153    if !cdi_hook.env.is_empty() {
154        builder = builder.env(cdi_hook.env.clone());
155    }
156    builder
157        .build()
158        .map_err(|e| AgentError::InvalidSpec(format!("failed to build CDI hook: {e}")))
159}
160
/// All Linux capabilities for privileged mode.
///
/// Lists 41 [`Capability`] variants, from `AuditControl` through `WakeAlarm`.
const ALL_CAPABILITIES: &[Capability] = &[
    Capability::AuditControl,
    Capability::AuditRead,
    Capability::AuditWrite,
    Capability::BlockSuspend,
    Capability::Bpf,
    Capability::CheckpointRestore,
    Capability::Chown,
    Capability::DacOverride,
    Capability::DacReadSearch,
    Capability::Fowner,
    Capability::Fsetid,
    Capability::IpcLock,
    Capability::IpcOwner,
    Capability::Kill,
    Capability::Lease,
    Capability::LinuxImmutable,
    Capability::MacAdmin,
    Capability::MacOverride,
    Capability::Mknod,
    Capability::NetAdmin,
    Capability::NetBindService,
    Capability::NetBroadcast,
    Capability::NetRaw,
    Capability::Perfmon,
    Capability::Setfcap,
    Capability::Setgid,
    Capability::Setpcap,
    Capability::Setuid,
    Capability::SysAdmin,
    Capability::SysBoot,
    Capability::SysChroot,
    Capability::SysModule,
    Capability::SysNice,
    Capability::SysPacct,
    Capability::SysPtrace,
    Capability::SysRawio,
    Capability::SysResource,
    Capability::SysTime,
    Capability::SysTtyConfig,
    Capability::Syslog,
    Capability::WakeAlarm,
];
205
/// Parse memory string like "512Mi", "1Gi" to bytes
///
/// Supports both IEC (binary) and SI (decimal) units:
/// - IEC: Ki, Mi, Gi, Ti (powers of 1024)
/// - SI: K/k, M/m, G/g, T/t (powers of 1000)
/// - No suffix: bytes
///
/// Leading and trailing whitespace around the whole string is ignored. The
/// numeric part must be a plain unsigned integer (no sign, no fraction, no
/// whitespace between the number and the unit).
///
/// # Examples
/// ```ignore
/// assert_eq!(parse_memory_string("512Mi").unwrap(), 512 * 1024 * 1024);
/// assert_eq!(parse_memory_string("1Gi").unwrap(), 1024 * 1024 * 1024);
/// assert_eq!(parse_memory_string("2G").unwrap(), 2 * 1000 * 1000 * 1000);
/// ```
///
/// # Errors
/// Returns an error if the string is empty, if the numeric part cannot be
/// parsed as a `u64`, or if the resulting byte count does not fit in a `u64`.
pub fn parse_memory_string(s: &str) -> std::result::Result<u64, String> {
    const KI: u64 = 1024;
    const K: u64 = 1000;
    // Suffix table, tried in order. IEC suffixes end in 'i', so they can never
    // be confused with the single-letter SI suffixes.
    const UNITS: &[(&str, u64)] = &[
        ("Ki", KI),
        ("Mi", KI * KI),
        ("Gi", KI * KI * KI),
        ("Ti", KI * KI * KI * KI),
        ("K", K),
        ("k", K),
        ("M", K * K),
        ("m", K * K),
        ("G", K * K * K),
        ("g", K * K * K),
        ("T", K * K * K * K),
        ("t", K * K * K * K),
    ];

    let s = s.trim();
    if s.is_empty() {
        return Err("empty memory string".to_string());
    }

    let (num_str, multiplier) = UNITS
        .iter()
        .find_map(|&(suffix, mult)| s.strip_suffix(suffix).map(|n| (n, mult)))
        .unwrap_or((s, 1));

    let num: u64 = num_str
        .parse()
        .map_err(|e| format!("invalid number: {e}"))?;

    // A plain `num * multiplier` panics in debug builds and silently wraps in
    // release builds for oversized inputs; report it as a parse error instead.
    num.checked_mul(multiplier)
        .ok_or_else(|| format!("memory size '{s}' overflows u64"))
}
254
/// Get major and minor device numbers from a device path
///
/// Reads the file's `st_rdev` and splits it using the Linux `dev_t` layout
/// (the same bit arithmetic as glibc's `gnu_dev_major` / `gnu_dev_minor`):
/// the major number is bits 8..20 plus bits 44..64, the minor number is bits
/// 0..8 plus bits 20..44. Masking each to 8 bits would mis-report any device
/// whose major or minor is 256 or larger.
///
/// Unix-only: relies on `MetadataExt::rdev()` which isn't available on Windows.
/// When `bundle.rs` is compiled for a Windows host (for the WSL2 delegate's
/// cross-platform `build_spec_only` path), device probing is skipped entirely —
/// the Linux side of the delegate is responsible for its own device fingerprint.
/// The non-Unix stub below returns `Unsupported` so the `if let Ok(..)` /
/// `.unwrap_or(..)` call sites at the CDI / GPU passthrough paths skip cleanly.
///
/// NOTE(review): the decoding is Linux-specific; other Unix targets (e.g.
/// macOS) encode `dev_t` differently — confirm this is only relied on for
/// Linux hosts.
///
/// # Errors
/// Propagates the I/O error from `std::fs::metadata` (e.g. the path does not
/// exist).
#[cfg(unix)]
// Both halves are at most 32 bits wide after masking, so the casts cannot wrap.
#[allow(clippy::cast_possible_wrap)]
fn get_device_major_minor(path: &str) -> std::io::Result<(i64, i64)> {
    use std::os::unix::fs::MetadataExt;
    let rdev = std::fs::metadata(path)?.rdev();
    let major = ((rdev >> 32) & 0xffff_f000) | ((rdev >> 8) & 0x0000_0fff);
    let minor = ((rdev >> 12) & 0xffff_ff00) | (rdev & 0x0000_00ff);
    Ok((major as i64, minor as i64))
}
274
/// Non-Unix stand-in for the device-number probe.
///
/// Always fails with `ErrorKind::Unsupported`; call sites handle the error via
/// `if let Ok(..)` / `.unwrap_or(..)` and skip the probe.
#[cfg(not(unix))]
fn get_device_major_minor(_path: &str) -> std::io::Result<(i64, i64)> {
    use std::io::{Error, ErrorKind};

    let unsupported = Error::new(
        ErrorKind::Unsupported,
        "device-cgroup probes require Unix",
    );
    Err(unsupported)
}
283
284/// Detect device type from path
285///
286/// Unix-only: uses `FileTypeExt::is_char_device` / `is_block_device` which are
287/// not available on Windows. See `get_device_major_minor` for the rationale.
288#[cfg(unix)]
289fn get_device_type(path: &str) -> std::io::Result<LinuxDeviceType> {
290    use std::os::unix::fs::FileTypeExt;
291    let metadata = std::fs::metadata(path)?;
292    let file_type = metadata.file_type();
293    if file_type.is_char_device() {
294        Ok(LinuxDeviceType::C)
295    } else if file_type.is_block_device() {
296        Ok(LinuxDeviceType::B)
297    } else {
298        Ok(LinuxDeviceType::U) // Unknown/other
299    }
300}
301
/// Non-Unix stand-in for the device-type probe.
///
/// Always fails with `ErrorKind::Unsupported`; call sites fall back through
/// `.unwrap_or(..)`.
#[cfg(not(unix))]
fn get_device_type(_path: &str) -> std::io::Result<LinuxDeviceType> {
    use std::io::{Error, ErrorKind};

    let unsupported = Error::new(
        ErrorKind::Unsupported,
        "device-cgroup probes require Unix",
    );
    Err(unsupported)
}
310
/// Builder for OCI container bundles
///
/// Creates the directory structure and config.json required for OCI-compliant
/// container runtimes like runc or youki.
///
/// Configuration is supplied through the consuming `with_*` methods; nothing
/// is written to disk until [`BundleBuilder::build`] is called.
///
/// `Debug` is implemented by hand: the `secrets_provider` and `cdi_registry`
/// fields are rendered only as "is set" booleans.
///
/// # Example
/// ```ignore
/// let dirs = zlayer_paths::ZLayerDirs::system_default();
/// let builder = BundleBuilder::new(dirs.bundles().join("mycontainer"))
///     .with_rootfs(dirs.rootfs().join("myimage"));
///
/// let bundle_path = builder.build(&container_id, &service_spec).await?;
/// ```
#[derive(Clone)]
pub struct BundleBuilder {
    /// Base directory for the bundle
    bundle_dir: PathBuf,
    /// Path to the unpacked rootfs (from image layers). When `None`, `build`
    /// creates an empty `rootfs/` directory instead of a symlink.
    rootfs_path: Option<PathBuf>,
    /// Custom hostname (defaults to container ID)
    hostname: Option<String>,
    /// Additional environment variables, in the order they were added
    extra_env: Vec<(String, String)>,
    /// Custom working directory
    cwd: Option<String>,
    /// Custom command/args to run (overrides image default)
    args: Option<Vec<String>>,
    /// Pre-resolved volume paths from `StorageManager`
    volume_paths: HashMap<String, PathBuf>,
    /// Image configuration from the OCI registry (entrypoint, cmd, env, workdir, user)
    image_config: Option<zlayer_registry::ImageConfig>,
    /// Use host networking (skip Network namespace, container shares host network)
    host_network: bool,
    /// Secrets provider for resolving $S: prefixed env vars
    secrets_provider: Option<Arc<dyn SecretsProvider>>,
    /// Deployment scope for secret lookups (e.g., deployment name)
    deployment_scope: Option<String>,
    /// Host-side Unix socket path to bind-mount into the container
    socket_path: Option<String>,
    /// Optional CDI registry override (defaults to discovery from system paths).
    ///
    /// Wrapped in `Arc` so [`BundleBuilder`] can stay [`Clone`]. Primarily set
    /// in tests via [`BundleBuilder::with_cdi_registry`]; production paths
    /// leave this `None` and lazy-discover via [`CdiRegistry::discover`] when
    /// a `GpuSpec` is present.
    cdi_registry: Option<Arc<CdiRegistry>>,
}
358
359impl std::fmt::Debug for BundleBuilder {
360    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
361        f.debug_struct("BundleBuilder")
362            .field("bundle_dir", &self.bundle_dir)
363            .field("rootfs_path", &self.rootfs_path)
364            .field("hostname", &self.hostname)
365            .field("extra_env", &self.extra_env)
366            .field("cwd", &self.cwd)
367            .field("args", &self.args)
368            .field("volume_paths", &self.volume_paths)
369            .field("image_config", &self.image_config)
370            .field("host_network", &self.host_network)
371            .field("secrets_provider", &self.secrets_provider.is_some())
372            .field("deployment_scope", &self.deployment_scope)
373            .field("socket_path", &self.socket_path)
374            .field("cdi_registry", &self.cdi_registry.is_some())
375            .finish()
376    }
377}
378
379/// Build OCI `uid_mappings` (or `gid_mappings` — same structure) for a rootless
380/// container. Always emits a single-id mapping (container 0 → `host_id`, size 1).
381/// If `username` has an entry in `subid_path` (e.g. /etc/subuid), appends a
382/// range mapping (container 1 → range start, size = range count).
383///
384/// Rootless user-namespace mapping is a Linux/libcontainer concept; on Windows
385/// containers run via HCS so this helper is unix-only.
386#[cfg(unix)]
387fn build_rootless_id_mappings(
388    host_id: u32,
389    subid_path: &str,
390    username: &str,
391) -> Vec<oci_spec::runtime::LinuxIdMapping> {
392    let mut mappings = vec![LinuxIdMappingBuilder::default()
393        .container_id(0_u32)
394        .host_id(host_id)
395        .size(1_u32)
396        .build()
397        .unwrap()];
398    if !username.is_empty() {
399        if let Some((start, count)) = read_subid_range(subid_path, username) {
400            mappings.push(
401                LinuxIdMappingBuilder::default()
402                    .container_id(1_u32)
403                    .host_id(start)
404                    .size(count)
405                    .build()
406                    .unwrap(),
407            );
408        }
409    }
410    mappings
411}
412
/// Read /etc/subuid (or /etc/subgid) and return the (start, count) range
/// allocated to the given username, if any.
///
/// Each line is expected to look like `user:start:count`. The first line for
/// `username` whose `start` and `count` both parse as `u32` wins. Lines that
/// are malformed (missing fields, non-numeric values) are skipped rather than
/// aborting the search, and whitespace around the line and around each numeric
/// field is ignored.
///
/// Returns None on any I/O error or when the user has no usable entry —
/// callers must fall back to a single-id mapping in that case.
///
/// Subuid files are a Linux concept and the only caller is the unix-gated
/// `build_rootless_id_mappings`, so this helper is unix-only as well.
#[cfg(unix)]
fn read_subid_range(path: &str, username: &str) -> Option<(u32, u32)> {
    let contents = std::fs::read_to_string(path).ok()?;
    contents.lines().find_map(|line| {
        // Inside this closure `?` only rejects the current line; `find_map`
        // then moves on to the next one.
        let mut fields = line.trim().splitn(3, ':');
        if fields.next()? != username {
            return None;
        }
        let start = fields.next()?.trim().parse::<u32>().ok()?;
        let count = fields.next()?.trim().parse::<u32>().ok()?;
        Some((start, count))
    })
}
435
436impl BundleBuilder {
437    /// Create a new `BundleBuilder` with the specified bundle directory
438    ///
439    /// The bundle directory will be created if it doesn't exist.
440    /// The structure will be:
441    /// ```text
442    /// {bundle_dir}/
443    /// ├── config.json
444    /// └── rootfs/  (symlink to actual rootfs or mount point)
445    /// ```
446    #[must_use]
447    pub fn new(bundle_dir: PathBuf) -> Self {
448        Self {
449            bundle_dir,
450            rootfs_path: None,
451            hostname: None,
452            extra_env: Vec::new(),
453            cwd: None,
454            args: None,
455            volume_paths: HashMap::new(),
456            image_config: None,
457            host_network: false,
458            secrets_provider: None,
459            deployment_scope: None,
460            socket_path: None,
461            cdi_registry: None,
462        }
463    }
464
465    /// Override the CDI registry used for GPU device resolution.
466    ///
467    /// When unset, [`build_oci_spec`](Self::build_oci_spec) discovers CDI
468    /// specs lazily from the standard system search paths (`/etc/cdi`,
469    /// `/var/run/cdi`, plus `$CDI_SPEC_DIRS`). Tests use this setter to
470    /// inject fixture-backed registries pointed at a temp directory.
471    #[must_use]
472    pub fn with_cdi_registry(mut self, registry: Arc<CdiRegistry>) -> Self {
473        self.cdi_registry = Some(registry);
474        self
475    }
476
477    /// Create a `BundleBuilder` for a container in the default bundle location
478    #[must_use]
479    pub fn for_container(container_id: &ContainerId) -> Self {
480        let bundle_dir = zlayer_paths::ZLayerDirs::system_default()
481            .bundles()
482            .join(container_id.to_string());
483        Self::new(bundle_dir)
484    }
485
486    /// Set the rootfs path (from unpacked image layers)
487    ///
488    /// This path will be symlinked into the bundle as `rootfs/`
489    #[must_use]
490    pub fn with_rootfs(mut self, rootfs_path: PathBuf) -> Self {
491        self.rootfs_path = Some(rootfs_path);
492        self
493    }
494
495    /// Set a custom hostname for the container
496    #[must_use]
497    pub fn with_hostname(mut self, hostname: String) -> Self {
498        self.hostname = Some(hostname);
499        self
500    }
501
502    /// Add extra environment variables
503    #[must_use]
504    pub fn with_env(mut self, key: String, value: String) -> Self {
505        self.extra_env.push((key, value));
506        self
507    }
508
509    /// Set the working directory
510    #[must_use]
511    pub fn with_cwd(mut self, cwd: String) -> Self {
512        self.cwd = Some(cwd);
513        self
514    }
515
516    /// Set the command/args to run
517    #[must_use]
518    pub fn with_args(mut self, args: Vec<String>) -> Self {
519        self.args = Some(args);
520        self
521    }
522
523    /// Set pre-resolved volume paths from `StorageManager`
524    ///
525    /// These are used to map named/anonymous/S3 volumes to their host paths
526    /// when building storage mounts in the OCI spec.
527    #[must_use]
528    pub fn with_volume_paths(mut self, volume_paths: HashMap<String, PathBuf>) -> Self {
529        self.volume_paths = volume_paths;
530        self
531    }
532
533    /// Set the OCI image configuration (entrypoint, cmd, env, workdir, user)
534    ///
535    /// When set, the image config provides defaults for the container process
536    /// that are used when the deployment spec doesn't override them.
537    #[must_use]
538    pub fn with_image_config(mut self, config: zlayer_registry::ImageConfig) -> Self {
539        self.image_config = Some(config);
540        self
541    }
542
543    /// Enable host networking mode
544    ///
545    /// When true, the container will NOT get its own network namespace and will
546    /// share the host's network stack. This is equivalent to Docker's `--network host`.
547    /// Use this when overlay networking is unavailable or not desired.
548    #[must_use]
549    pub fn with_host_network(mut self, host_network: bool) -> Self {
550        self.host_network = host_network;
551        self
552    }
553
554    /// Set the secrets provider for resolving `$S:` prefixed environment variables
555    ///
556    /// When set, environment variables with `$S:secret-name` syntax will be resolved
557    /// from this provider at bundle creation time.
558    #[must_use]
559    pub fn with_secrets_provider(mut self, provider: Arc<dyn SecretsProvider>) -> Self {
560        self.secrets_provider = Some(provider);
561        self
562    }
563
564    /// Set the deployment scope for secret lookups
565    ///
566    /// This is typically the deployment name and is used as the scope when
567    /// resolving `$S:` prefixed environment variables.
568    #[must_use]
569    pub fn with_deployment_scope(mut self, scope: String) -> Self {
570        self.deployment_scope = Some(scope);
571        self
572    }
573
574    /// Set a host-side Unix socket path to bind-mount into the container at
575    /// the default `ZLayer` socket path (read-only).
576    #[must_use]
577    pub fn with_socket_mount(mut self, path: impl Into<String>) -> Self {
578        self.socket_path = Some(path.into());
579        self
580    }
581
582    /// Get the bundle directory path
583    #[must_use]
584    pub fn bundle_dir(&self) -> &Path {
585        &self.bundle_dir
586    }
587
588    /// Build the OCI bundle from a `ServiceSpec`
589    ///
590    /// Creates the bundle directory structure and generates config.json
591    /// based on the provided service specification.
592    ///
593    /// # Returns
594    /// The path to the bundle directory on success
595    ///
596    /// # Errors
597    /// - `AgentError::CreateFailed` if directory creation fails
598    /// - `AgentError::InvalidSpec` if the OCI spec generation fails
599    ///
600    /// # Platform
601    /// Unix-only. Uses `tokio::fs::symlink` which is defined in terms of
602    /// `std::os::unix::fs::symlink` and does not exist on Windows. The Windows
603    /// WSL2 delegate path should call [`BundleBuilder::build_spec_only`] to
604    /// obtain the OCI [`Spec`] and pipe it into the WSL2 distro, where the
605    /// Linux side of the delegate handles bundle directory creation.
606    #[cfg(unix)]
607    pub async fn build(&self, container_id: &ContainerId, spec: &ServiceSpec) -> Result<PathBuf> {
608        // Create bundle directory
609        fs::create_dir_all(&self.bundle_dir)
610            .await
611            .map_err(|e| AgentError::CreateFailed {
612                id: container_id.to_string(),
613                reason: format!("failed to create bundle directory: {e}"),
614            })?;
615
616        // Set up rootfs (symlink or create empty directory)
617        let rootfs_in_bundle = self.bundle_dir.join("rootfs");
618        if let Some(ref rootfs_path) = self.rootfs_path {
619            // Remove existing rootfs symlink/dir if present
620            let _ = fs::remove_file(&rootfs_in_bundle).await;
621            let _ = fs::remove_dir(&rootfs_in_bundle).await;
622
623            // Create symlink to actual rootfs.
624            // On Unix: `tokio::fs::symlink` (unified file/dir symlink).
625            // On Windows: `tokio::fs::symlink_dir` (wraps CreateSymbolicLinkW with
626            // SYMBOLIC_LINK_FLAG_DIRECTORY) — rootfs is always an OCI layer directory.
627            #[cfg(unix)]
628            tokio::fs::symlink(rootfs_path, &rootfs_in_bundle)
629                .await
630                .map_err(|e| AgentError::CreateFailed {
631                    id: container_id.to_string(),
632                    reason: format!(
633                        "failed to symlink rootfs from {} to {}: {}",
634                        rootfs_path.display(),
635                        rootfs_in_bundle.display(),
636                        e
637                    ),
638                })?;
639
640            #[cfg(windows)]
641            tokio::fs::symlink_dir(rootfs_path, &rootfs_in_bundle)
642                .await
643                .map_err(|e| AgentError::CreateFailed {
644                    id: container_id.to_string(),
645                    reason: format!(
646                        "failed to symlink rootfs from {} to {}: {}",
647                        rootfs_path.display(),
648                        rootfs_in_bundle.display(),
649                        e
650                    ),
651                })?;
652        } else {
653            // Create empty rootfs directory (for bind mounts)
654            fs::create_dir_all(&rootfs_in_bundle)
655                .await
656                .map_err(|e| AgentError::CreateFailed {
657                    id: container_id.to_string(),
658                    reason: format!("failed to create rootfs directory: {e}"),
659                })?;
660        }
661
662        // Generate OCI runtime spec
663        let oci_spec = self
664            .build_spec_only(container_id, spec, &self.volume_paths)
665            .await?;
666
667        // Write config.json
668        let config_path = self.bundle_dir.join("config.json");
669        let config_json =
670            serde_json::to_string_pretty(&oci_spec).map_err(|e| AgentError::CreateFailed {
671                id: container_id.to_string(),
672                reason: format!("failed to serialize OCI spec: {e}"),
673            })?;
674
675        fs::write(&config_path, config_json)
676            .await
677            .map_err(|e| AgentError::CreateFailed {
678                id: container_id.to_string(),
679                reason: format!("failed to write config.json: {e}"),
680            })?;
681
682        tracing::debug!(
683            "Created OCI bundle at {} for container {}",
684            self.bundle_dir.display(),
685            container_id
686        );
687
688        Ok(self.bundle_dir.clone())
689    }
690
691    /// Render the OCI runtime spec without creating a bundle directory
692    /// or writing `config.json`.
693    ///
694    /// This is the cross-platform entry point for OCI spec generation and is
695    /// the only bundle-builder method that is callable on Windows. Used by the
696    /// WSL2 delegate runtime (`runtimes/wsl2_delegate.rs`): the Windows host
697    /// renders the spec, then streams the JSON into the WSL distro filesystem
698    /// where `youki` will consume it. The bundle path passed to
699    /// `BundleBuilder::new` is purely informational in that flow; this method
700    /// never touches the filesystem.
701    ///
702    /// Unix hosts that want both the spec *and* the on-disk bundle layout
703    /// (rootfs symlink, `config.json`, parent directories) should continue to
704    /// use [`BundleBuilder::build`] or [`BundleBuilder::write_config`].
705    ///
706    /// # Errors
707    /// Returns [`AgentError::InvalidSpec`] if any of the OCI `*Builder` types
708    /// reject the configuration, or if environment-variable secret resolution
709    /// fails.
710    pub async fn build_spec_only(
711        &self,
712        container_id: &ContainerId,
713        spec: &ServiceSpec,
714        volume_paths: &std::collections::HashMap<String, PathBuf>,
715    ) -> Result<oci_spec::runtime::Spec> {
716        self.build_oci_spec(container_id, spec, volume_paths).await
717    }
718
719    /// Resolve CDI edits for a service spec's GPU request, if any.
720    ///
721    /// Returns:
722    /// - `Ok(None)` when the spec has no `GpuSpec`, when the vendor isn't a
723    ///   known CDI-published kind (e.g. `"apple"`), or when no explicit
724    ///   registry was set and lazy discovery turned up no installed specs
725    ///   (production fallback — baked-in defaults take over).
726    /// - `Ok(Some(vec))` with one entry per requested device when CDI specs
727    ///   are available and resolution succeeds.
728    /// - `Err(AgentError::InvalidSpec(...))` when the caller explicitly opted
729    ///   into CDI (via `with_cdi_registry`) but the resolution fails —
730    ///   surfaces [`cdi::CdiError::SpecMissing`] /
731    ///   [`cdi::CdiError::DeviceMissing`] / [`cdi::CdiError::NoDevices`] as
732    ///   actionable strings.
733    fn resolve_cdi_edits(&self, spec: &ServiceSpec) -> Result<Option<Vec<CdiContainerEdits>>> {
734        let Some(ref gpu) = spec.resources.gpu else {
735            return Ok(None);
736        };
737
738        // Map short vendor to CDI kind. Unknown vendors (e.g. "apple") fall
739        // back to baked-in behavior.
740        let Some(kind) = cdi::vendor_to_cdi_kind(&gpu.vendor) else {
741            return Ok(None);
742        };
743
744        // Decide registry source:
745        // - Explicit override: strict mode. Missing kind/device == hard error.
746        // - Lazy discover: opportunistic. Missing kind == silent fallback to
747        //   baked-in defaults so prod hosts without CDI installed keep
748        //   working.
749        let (registry, strict) = if let Some(reg) = &self.cdi_registry {
750            (reg.clone(), true)
751        } else {
752            let reg = Arc::new(CdiRegistry::discover());
753            if reg.is_empty() {
754                return Ok(None);
755            }
756            (reg, false)
757        };
758
759        let device_names: Vec<String> = (0..gpu.count).map(|i| i.to_string()).collect();
760
761        match registry.resolve_for_kind(kind, &device_names) {
762            Ok(edits) => Ok(Some(edits)),
763            Err(err) => {
764                if strict {
765                    Err(AgentError::InvalidSpec(format!(
766                        "CDI resolution failed for vendor '{}': {err}",
767                        gpu.vendor
768                    )))
769                } else {
770                    tracing::warn!(
771                        vendor = %gpu.vendor,
772                        kind = %kind,
773                        error = %err,
774                        "CDI resolution failed; falling back to baked-in GPU device passthrough"
775                    );
776                    Ok(None)
777                }
778            }
779        }
780    }
781
782    /// Build the OCI runtime spec from `ServiceSpec`.
783    ///
784    /// The full, CDI-aware implementation that backs both
785    /// [`BundleBuilder::build_spec_only`] (cross-platform, public) and the
786    /// Unix-only [`BundleBuilder::build`] / [`BundleBuilder::write_config`]
787    /// paths that additionally manage the bundle directory on disk.
788    ///
789    /// # Errors
790    /// Returns [`AgentError::InvalidSpec`] if any of the OCI `*Builder` types
791    /// reject the configuration, or if environment-variable secret resolution
792    /// fails.
793    ///
794    /// # Panics
795    /// Panics if the builder-internal `MountBuilder::build()` call fails for
796    /// the optional `ZLayer` API socket bind-mount. This is only reachable when
797    /// [`BundleBuilder::with_socket_mount`] has been used with a malformed
798    /// path, and is treated as a programmer error because all fields are
799    /// statically constructed from known-good inputs.
800    #[allow(clippy::too_many_lines)]
801    async fn build_oci_spec(
802        &self,
803        container_id: &ContainerId,
804        spec: &ServiceSpec,
805        volume_paths: &std::collections::HashMap<String, PathBuf>,
806    ) -> Result<Spec> {
807        // Resolve CDI edits up front. When present, these replace the
808        // baked-in vendor device-node / env injection below; when absent
809        // (no CDI installed, unknown vendor), the legacy code paths run.
810        let cdi_edits = self.resolve_cdi_edits(spec)?;
811
812        // Build user: image config user > root (spec doesn't currently have user override)
813        let user = {
814            let (uid, gid) = if let Some(user_str) = self
815                .image_config
816                .as_ref()
817                .and_then(|c| c.user.as_ref())
818                .filter(|u| !u.is_empty())
819            {
820                // Parse "uid:gid" or "uid" format from image config
821                let parts: Vec<&str> = user_str.splitn(2, ':').collect();
822                let uid = parts[0].parse::<u32>().unwrap_or(0);
823                let gid = if parts.len() > 1 {
824                    parts[1].parse::<u32>().unwrap_or(0)
825                } else {
826                    uid
827                };
828                (uid, gid)
829            } else {
830                (0u32, 0u32)
831            };
832
833            UserBuilder::default()
834                .uid(uid)
835                .gid(gid)
836                .build()
837                .map_err(|e| AgentError::InvalidSpec(format!("failed to build user: {e}")))?
838        };
839
840        // Build environment variables
841        // Layer: image config env (base) -> defaults -> spec env -> builder extra env
842        let mut env: Vec<String> = Vec::new();
843        let mut env_keys: HashSet<String> = HashSet::new();
844
845        // Seed with image config env first (lowest priority)
846        if let Some(img_env) = self.image_config.as_ref().and_then(|c| c.env.as_ref()) {
847            for entry in img_env {
848                if let Some(key) = entry.split('=').next() {
849                    env_keys.insert(key.to_string());
850                }
851                env.push(entry.clone());
852            }
853        }
854
855        // If image config didn't provide PATH, add the default
856        if !env_keys.contains("PATH") {
857            env.push(
858                "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin".to_string(),
859            );
860            env_keys.insert("PATH".to_string());
861        }
862
863        // Add TERM for interactive compatibility (if not already set)
864        if !env_keys.contains("TERM") {
865            env.push("TERM=xterm".to_string());
866            env_keys.insert("TERM".to_string());
867        }
868
869        // Add service-specific env vars, resolving $S: and $E: prefixed references
870        // These override image config env for same keys
871        //
872        // When a secrets provider is available, use the full secrets-aware resolver
873        // that handles both $S: (secret) and $E: (env) prefixed values.
874        // Otherwise fall back to the env-only resolver.
875        if let (Some(secrets_provider), Some(scope)) =
876            (&self.secrets_provider, &self.deployment_scope)
877        {
878            let resolved_map =
879                crate::env::resolve_env_with_secrets(&spec.env, secrets_provider.as_ref(), scope)
880                    .await
881                    .map_err(|e| {
882                        AgentError::InvalidSpec(format!(
883                            "environment variable resolution failed: {e}"
884                        ))
885                    })?;
886
887            for (key, value) in &resolved_map {
888                if env_keys.contains(key.as_str()) {
889                    env.retain(|e| e.split('=').next() != Some(key.as_str()));
890                }
891                env_keys.insert(key.clone());
892                env.push(format!("{key}={value}"));
893            }
894        } else {
895            let resolved = crate::env::resolve_env_vars_with_warnings(&spec.env).map_err(|e| {
896                AgentError::InvalidSpec(format!("environment variable resolution failed: {e}"))
897            })?;
898
899            // Log any warnings about resolved env vars
900            for warning in &resolved.warnings {
901                tracing::warn!(container = %container_id, "{}", warning);
902            }
903
904            // Merge spec env: spec values take precedence over image config for same keys
905            for var in &resolved.vars {
906                if let Some(key) = var.split('=').next() {
907                    if env_keys.contains(key) {
908                        // Remove the old entry from image config
909                        env.retain(|e| e.split('=').next() != Some(key));
910                    }
911                    env_keys.insert(key.to_string());
912                }
913                env.push(var.clone());
914            }
915        }
916
917        // Add extra env vars from builder (highest priority)
918        for (key, value) in &self.extra_env {
919            if env_keys.contains(key.as_str()) {
920                env.retain(|e| e.split('=').next() != Some(key.as_str()));
921            }
922            env_keys.insert(key.clone());
923            env.push(format!("{key}={value}"));
924        }
925
926        // GPU device visibility environment variables.
927        //
928        // When CDI edits are available, the vendor-supplied spec is the
929        // source of truth (e.g. NVIDIA's `nvidia-ctk cdi generate` emits
930        // `NVIDIA_VISIBLE_DEVICES` plus driver-capability env on every
931        // device entry). Otherwise fall back to the historical baked-in
932        // strings so non-CDI hosts continue to advertise the right devices
933        // to CUDA/ROCm/oneAPI runtimes.
934        if let Some(ref edits_per_device) = cdi_edits {
935            for edits in edits_per_device {
936                for entry in &edits.env {
937                    if let Some(key) = entry.split('=').next() {
938                        if env_keys.contains(key) {
939                            env.retain(|e| e.split('=').next() != Some(key));
940                        }
941                        env_keys.insert(key.to_string());
942                    }
943                    env.push(entry.clone());
944                }
945            }
946        } else if let Some(ref gpu) = spec.resources.gpu {
947            // Default to 0..count when no explicit indices are provided
948            let indices: Vec<String> = (0..gpu.count).map(|i| i.to_string()).collect();
949            let device_list = indices.join(",");
950            match gpu.vendor.as_str() {
951                "nvidia" => {
952                    env.push(format!("NVIDIA_VISIBLE_DEVICES={device_list}"));
953                    env.push(format!("CUDA_VISIBLE_DEVICES={device_list}"));
954                }
955                "amd" => {
956                    env.push(format!("ROCR_VISIBLE_DEVICES={device_list}"));
957                    env.push(format!("HIP_VISIBLE_DEVICES={device_list}"));
958                }
959                "intel" => {
960                    env.push(format!("ZE_AFFINITY_MASK={device_list}"));
961                }
962                _ => {}
963            }
964        }
965
966        // GPU sharing (MPS / time-slicing) env injection.
967        //
968        // Layered on top of the CDI / baked-in `*_VISIBLE_DEVICES` block above:
969        // * MPS: validate host pipe/log dirs exist (error otherwise) and
970        //   export `CUDA_MPS_PIPE_DIRECTORY` / `CUDA_MPS_LOG_DIRECTORY`.
971        // * Time-slicing: override `CUDA_VISIBLE_DEVICES` to the configured
972        //   slice index so the workload sees a single virtualised GPU rather
973        //   than the full 0..count list emitted above.
974        //
975        // The mount side (bind-mounting the MPS dirs / time-slicing config
976        // file) is handled further down where the rest of the mounts get
977        // assembled.
978        let mps_dirs = if let Some(ref gpu) = spec.resources.gpu {
979            resolve_mps_dirs(gpu)?
980        } else {
981            None
982        };
983        if let Some(ref dirs) = mps_dirs {
984            let pipe = format!("CUDA_MPS_PIPE_DIRECTORY={}", dirs.pipe_dir.display());
985            let log = format!("CUDA_MPS_LOG_DIRECTORY={}", dirs.log_dir.display());
986            if env_keys.contains("CUDA_MPS_PIPE_DIRECTORY") {
987                env.retain(|e| e.split('=').next() != Some("CUDA_MPS_PIPE_DIRECTORY"));
988            }
989            if env_keys.contains("CUDA_MPS_LOG_DIRECTORY") {
990                env.retain(|e| e.split('=').next() != Some("CUDA_MPS_LOG_DIRECTORY"));
991            }
992            env_keys.insert("CUDA_MPS_PIPE_DIRECTORY".to_string());
993            env_keys.insert("CUDA_MPS_LOG_DIRECTORY".to_string());
994            env.push(pipe);
995            env.push(log);
996        }
997        if let Some(ref gpu) = spec.resources.gpu {
998            if gpu.sharing == Some(GpuSharingMode::TimeSlice) {
999                if let Some(idx) = gpu.time_slice_index {
1000                    // Time-slicing virtualises a single physical GPU as N
1001                    // slices; the workload sees one device, addressed by
1002                    // its slice index. Override whatever the CDI / baked-in
1003                    // path emitted earlier.
1004                    env.retain(|e| e.split('=').next() != Some("CUDA_VISIBLE_DEVICES"));
1005                    env_keys.insert("CUDA_VISIBLE_DEVICES".to_string());
1006                    env.push(format!("CUDA_VISIBLE_DEVICES={idx}"));
1007                }
1008            }
1009        }
1010
1011        // Inject distributed training coordination env vars when configured.
1012        // MASTER_ADDR uses the service DNS name (resolved by the overlay DNS).
1013        // RANK defaults to 0 (overridden by the agent when placing specific replicas).
1014        if let Some(ref gpu) = spec.resources.gpu {
1015            if let Some(ref dist) = gpu.distributed {
1016                env.push(format!("MASTER_PORT={}", dist.master_port));
1017                env.push(format!("MASTER_ADDR={}", container_id.service));
1018                env.push("WORLD_SIZE=1".to_string());
1019                env.push("RANK=0".to_string());
1020                env.push("LOCAL_RANK=0".to_string());
1021                match dist.backend.as_str() {
1022                    "nccl" => env.push("NCCL_SOCKET_IFNAME=eth0".to_string()),
1023                    "gloo" => env.push("GLOO_SOCKET_IFNAME=eth0".to_string()),
1024                    _ => {}
1025                }
1026            }
1027        }
1028
1029        // Build capabilities
1030        let capabilities = self.build_capabilities(spec)?;
1031
1032        // Determine working directory: builder override > spec.command.workdir > image config > "/"
1033        let cwd = self
1034            .cwd
1035            .clone()
1036            .or_else(|| spec.command.workdir.clone())
1037            .or_else(|| {
1038                self.image_config
1039                    .as_ref()
1040                    .and_then(|c| c.working_dir.as_ref())
1041                    .filter(|w| !w.is_empty())
1042                    .cloned()
1043            })
1044            .unwrap_or_else(|| "/".to_string());
1045
1046        // Resolve process args: builder override > spec command > image config > /bin/sh
1047        let process_args = if let Some(ref args) = self.args {
1048            args.clone()
1049        } else {
1050            Self::resolve_command_from_spec(spec, self.image_config.as_ref())
1051        };
1052
1053        // Build process
1054        let mut process_builder = ProcessBuilder::default()
1055            .terminal(false)
1056            .user(user)
1057            .env(env)
1058            .args(process_args)
1059            .cwd(cwd)
1060            .no_new_privileges(!spec.privileged && spec.capabilities.is_empty());
1061
1062        // Set capabilities if we have them
1063        if let Some(caps) = capabilities {
1064            process_builder = process_builder.capabilities(caps);
1065        }
1066
1067        let process = process_builder
1068            .build()
1069            .map_err(|e| AgentError::InvalidSpec(format!("failed to build process: {e}")))?;
1070
1071        // Build root filesystem config
1072        // Note: "rootfs" is relative to the bundle directory per OCI spec
1073        let root = RootBuilder::default()
1074            .path("rootfs".to_string())
1075            .readonly(false)
1076            .build()
1077            .map_err(|e| AgentError::InvalidSpec(format!("failed to build root: {e}")))?;
1078
1079        // Build default mounts
1080        let mut mounts = self.build_default_mounts(spec)?;
1081
1082        // Add storage mounts from spec
1083        let storage_mounts = self.build_storage_mounts(spec, volume_paths)?;
1084        mounts.extend(storage_mounts);
1085
1086        // Add ZLayer API socket bind-mount if configured.
1087        // Use typ("bind") so libcontainer's mount code handles the source path
1088        // correctly for sockets (canonicalize + file-based mount point creation).
1089        if let Some(ref socket_path) = self.socket_path {
1090            mounts.push(
1091                MountBuilder::default()
1092                    .destination(zlayer_paths::ZLayerDirs::default_socket_path())
1093                    .typ("bind")
1094                    .source(socket_path.clone())
1095                    .options(vec!["rbind".into(), "ro".into()])
1096                    .build()
1097                    .expect("valid socket mount"),
1098            );
1099        }
1100
1101        // Append CDI-provided mounts (e.g. vendor driver libraries that the
1102        // GPU runtime needs to expose to the container).
1103        if let Some(ref edits_per_device) = cdi_edits {
1104            for edits in edits_per_device {
1105                for cdi_mount in &edits.mounts {
1106                    let mut opts = cdi_mount.options.clone();
1107                    if !opts.iter().any(|o| o == "bind" || o == "rbind") {
1108                        opts.push("rbind".to_string());
1109                    }
1110                    mounts.push(
1111                        MountBuilder::default()
1112                            .destination(cdi_mount.container_path.clone())
1113                            .typ("bind")
1114                            .source(cdi_mount.host_path.clone())
1115                            .options(opts)
1116                            .build()
1117                            .map_err(|e| {
1118                                AgentError::InvalidSpec(format!("failed to build CDI mount: {e}"))
1119                            })?,
1120                    );
1121                }
1122            }
1123        }
1124
1125        // GPU sharing mounts.
1126        //
1127        // MPS: bind-mount the host pipe / log directories into the container
1128        // at the same path so the in-container CUDA runtime can talk to the
1129        // MPS daemon over its UNIX socket and append to the shared log.
1130        // The env vars (`CUDA_MPS_PIPE_DIRECTORY` / `CUDA_MPS_LOG_DIRECTORY`)
1131        // are exported earlier in the env-assembly block.
1132        //
1133        // Time-slicing: optionally surface the host's slicing config YAML at
1134        // a well-known read-only path so introspection tools inside the
1135        // container can read it.
1136        if let Some(ref dirs) = mps_dirs {
1137            mounts.push(
1138                MountBuilder::default()
1139                    .destination(dirs.pipe_dir.clone())
1140                    .typ("bind")
1141                    .source(dirs.pipe_dir.clone())
1142                    .options(vec!["rbind".into(), "rw".into()])
1143                    .build()
1144                    .map_err(|e| {
1145                        AgentError::InvalidSpec(format!("failed to build MPS pipe mount: {e}"))
1146                    })?,
1147            );
1148            mounts.push(
1149                MountBuilder::default()
1150                    .destination(dirs.log_dir.clone())
1151                    .typ("bind")
1152                    .source(dirs.log_dir.clone())
1153                    .options(vec!["rbind".into(), "rw".into()])
1154                    .build()
1155                    .map_err(|e| {
1156                        AgentError::InvalidSpec(format!("failed to build MPS log mount: {e}"))
1157                    })?,
1158            );
1159        }
1160        if let Some(ref gpu) = spec.resources.gpu {
1161            if gpu.sharing == Some(GpuSharingMode::TimeSlice) {
1162                if let Some(ref cfg_path) = gpu.time_slicing_config_path {
1163                    let host = PathBuf::from(cfg_path);
1164                    if !host.is_file() {
1165                        return Err(AgentError::GpuSharingUnavailable {
1166                            mode: "time-slice".to_string(),
1167                            reason: format!(
1168                                "time-slicing config {} is not a regular file on the host",
1169                                host.display()
1170                            ),
1171                        });
1172                    }
1173                    mounts.push(
1174                        MountBuilder::default()
1175                            .destination(PathBuf::from(TIMESLICE_CONFIG_CONTAINER_PATH))
1176                            .typ("bind")
1177                            .source(host)
1178                            .options(vec!["rbind".into(), "ro".into()])
1179                            .build()
1180                            .map_err(|e| {
1181                                AgentError::InvalidSpec(format!(
1182                                    "failed to build time-slicing config mount: {e}"
1183                                ))
1184                            })?,
1185                    );
1186                }
1187            }
1188        }
1189
1190        // Build Linux-specific config
1191        let linux = self.build_linux_config(container_id, spec, cdi_edits.as_deref())?;
1192
1193        // Determine hostname
1194        let hostname = self
1195            .hostname
1196            .clone()
1197            .unwrap_or_else(|| container_id.to_string());
1198
1199        // Build the complete spec, attaching any CDI-provided hooks.
1200        let mut spec_builder = SpecBuilder::default()
1201            .version("1.0.2".to_string())
1202            .root(root)
1203            .process(process)
1204            .hostname(hostname)
1205            .mounts(mounts)
1206            .linux(linux);
1207
1208        if let Some(ref edits_per_device) = cdi_edits {
1209            if let Some(hooks) = Self::build_hooks_from_cdi(edits_per_device)? {
1210                spec_builder = spec_builder.hooks(hooks);
1211            }
1212        }
1213
1214        let oci_spec = spec_builder
1215            .build()
1216            .map_err(|e| AgentError::InvalidSpec(format!("failed to build OCI spec: {e}")))?;
1217
1218        Ok(oci_spec)
1219    }
1220
1221    /// Convert the union of CDI hooks across all resolved devices into an
1222    /// OCI [`Hooks`] block.
1223    ///
1224    /// Returns `Ok(None)` when no device contributed hooks (so the spec
1225    /// builder skips the empty block — `oci-spec` treats `null` as "no
1226    /// hooks" while serializers may emit empty arrays otherwise).
1227    fn build_hooks_from_cdi(edits_per_device: &[CdiContainerEdits]) -> Result<Option<Hooks>> {
1228        let mut prestart: Vec<Hook> = Vec::new();
1229        let mut create_runtime: Vec<Hook> = Vec::new();
1230        let mut create_container: Vec<Hook> = Vec::new();
1231        let mut start_container: Vec<Hook> = Vec::new();
1232        let mut poststart: Vec<Hook> = Vec::new();
1233        let mut poststop: Vec<Hook> = Vec::new();
1234
1235        for edits in edits_per_device {
1236            let Some(ref h) = edits.hooks else { continue };
1237            for hook in &h.prestart {
1238                prestart.push(convert_cdi_hook(hook)?);
1239            }
1240            for hook in &h.create_runtime {
1241                create_runtime.push(convert_cdi_hook(hook)?);
1242            }
1243            for hook in &h.create_container {
1244                create_container.push(convert_cdi_hook(hook)?);
1245            }
1246            for hook in &h.start_container {
1247                start_container.push(convert_cdi_hook(hook)?);
1248            }
1249            for hook in &h.poststart {
1250                poststart.push(convert_cdi_hook(hook)?);
1251            }
1252            for hook in &h.poststop {
1253                poststop.push(convert_cdi_hook(hook)?);
1254            }
1255        }
1256
1257        if prestart.is_empty()
1258            && create_runtime.is_empty()
1259            && create_container.is_empty()
1260            && start_container.is_empty()
1261            && poststart.is_empty()
1262            && poststop.is_empty()
1263        {
1264            return Ok(None);
1265        }
1266
1267        let mut builder = HooksBuilder::default();
1268        if !prestart.is_empty() {
1269            #[allow(deprecated)]
1270            {
1271                builder = builder.prestart(prestart);
1272            }
1273        }
1274        if !create_runtime.is_empty() {
1275            builder = builder.create_runtime(create_runtime);
1276        }
1277        if !create_container.is_empty() {
1278            builder = builder.create_container(create_container);
1279        }
1280        if !start_container.is_empty() {
1281            builder = builder.start_container(start_container);
1282        }
1283        if !poststart.is_empty() {
1284            builder = builder.poststart(poststart);
1285        }
1286        if !poststop.is_empty() {
1287            builder = builder.poststop(poststop);
1288        }
1289
1290        let hooks = builder
1291            .build()
1292            .map_err(|e| AgentError::InvalidSpec(format!("failed to build CDI hooks: {e}")))?;
1293        Ok(Some(hooks))
1294    }
1295
1296    /// Build Linux capabilities configuration
1297    #[allow(clippy::unused_self)]
1298    fn build_capabilities(
1299        &self,
1300        spec: &ServiceSpec,
1301    ) -> Result<Option<oci_spec::runtime::LinuxCapabilities>> {
1302        if spec.privileged {
1303            // Privileged mode: all capabilities
1304            let all_caps: HashSet<Capability> = ALL_CAPABILITIES.iter().copied().collect();
1305            let empty_caps: HashSet<Capability> = HashSet::new();
1306
1307            let caps = LinuxCapabilitiesBuilder::default()
1308                .bounding(all_caps.clone())
1309                .effective(all_caps.clone())
1310                .permitted(all_caps)
1311                .inheritable(empty_caps.clone())
1312                .ambient(empty_caps)
1313                .build()
1314                .map_err(|e| {
1315                    AgentError::InvalidSpec(format!("failed to build capabilities: {e}"))
1316                })?;
1317
1318            Ok(Some(caps))
1319        } else if !spec.capabilities.is_empty() {
1320            // Specific capabilities requested
1321            let caps: HashSet<Capability> = spec
1322                .capabilities
1323                .iter()
1324                .filter_map(|c| {
1325                    // Normalize capability name (add CAP_ prefix if missing, uppercase)
1326                    let cap_name = if c.starts_with("CAP_") {
1327                        c.to_uppercase()
1328                    } else {
1329                        format!("CAP_{}", c.to_uppercase())
1330                    };
1331                    Capability::from_str(&cap_name).ok()
1332                })
1333                .collect();
1334
1335            let empty_caps: HashSet<Capability> = HashSet::new();
1336
1337            let built_caps = LinuxCapabilitiesBuilder::default()
1338                .bounding(caps.clone())
1339                .effective(caps.clone())
1340                .permitted(caps)
1341                .inheritable(empty_caps.clone())
1342                .ambient(empty_caps)
1343                .build()
1344                .map_err(|e| {
1345                    AgentError::InvalidSpec(format!("failed to build capabilities: {e}"))
1346                })?;
1347
1348            Ok(Some(built_caps))
1349        } else {
1350            // Default: minimal capabilities for basic container operation
1351            let default_caps: HashSet<Capability> = [
1352                Capability::Chown,
1353                Capability::DacOverride,
1354                Capability::Fsetid,
1355                Capability::Fowner,
1356                Capability::Mknod,
1357                Capability::NetRaw,
1358                Capability::Setgid,
1359                Capability::Setuid,
1360                Capability::Setfcap,
1361                Capability::Setpcap,
1362                Capability::NetBindService,
1363                Capability::SysChroot,
1364                Capability::Kill,
1365                Capability::AuditWrite,
1366            ]
1367            .into_iter()
1368            .collect();
1369
1370            let empty_caps: HashSet<Capability> = HashSet::new();
1371
1372            let built_caps = LinuxCapabilitiesBuilder::default()
1373                .bounding(default_caps.clone())
1374                .effective(default_caps.clone())
1375                .permitted(default_caps)
1376                .inheritable(empty_caps.clone())
1377                .ambient(empty_caps)
1378                .build()
1379                .map_err(|e| {
1380                    AgentError::InvalidSpec(format!("failed to build capabilities: {e}"))
1381                })?;
1382
1383            Ok(Some(built_caps))
1384        }
1385    }
1386
1387    /// Build default filesystem mounts for the container
1388    #[allow(clippy::unused_self, clippy::too_many_lines)]
1389    fn build_default_mounts(&self, spec: &ServiceSpec) -> Result<Vec<Mount>> {
1390        let mut mounts = Vec::new();
1391
1392        // /proc
1393        mounts.push(
1394            MountBuilder::default()
1395                .destination("/proc".to_string())
1396                .typ("proc".to_string())
1397                .source("proc".to_string())
1398                .options(vec![
1399                    "nosuid".to_string(),
1400                    "noexec".to_string(),
1401                    "nodev".to_string(),
1402                ])
1403                .build()
1404                .map_err(|e| {
1405                    AgentError::InvalidSpec(format!("failed to build /proc mount: {e}"))
1406                })?,
1407        );
1408
1409        // /dev
1410        mounts.push(
1411            MountBuilder::default()
1412                .destination("/dev".to_string())
1413                .typ("tmpfs".to_string())
1414                .source("tmpfs".to_string())
1415                .options(vec![
1416                    "nosuid".to_string(),
1417                    "strictatime".to_string(),
1418                    "mode=755".to_string(),
1419                    "size=65536k".to_string(),
1420                ])
1421                .build()
1422                .map_err(|e| AgentError::InvalidSpec(format!("failed to build /dev mount: {e}")))?,
1423        );
1424
1425        // /dev/pts
1426        mounts.push(
1427            MountBuilder::default()
1428                .destination("/dev/pts".to_string())
1429                .typ("devpts".to_string())
1430                .source("devpts".to_string())
1431                .options(vec![
1432                    "nosuid".to_string(),
1433                    "noexec".to_string(),
1434                    "newinstance".to_string(),
1435                    "ptmxmode=0666".to_string(),
1436                    "mode=0620".to_string(),
1437                    "gid=5".to_string(),
1438                ])
1439                .build()
1440                .map_err(|e| {
1441                    AgentError::InvalidSpec(format!("failed to build /dev/pts mount: {e}"))
1442                })?,
1443        );
1444
1445        // /dev/shm
1446        mounts.push(
1447            MountBuilder::default()
1448                .destination("/dev/shm".to_string())
1449                .typ("tmpfs".to_string())
1450                .source("shm".to_string())
1451                .options(vec![
1452                    "nosuid".to_string(),
1453                    "noexec".to_string(),
1454                    "nodev".to_string(),
1455                    "mode=1777".to_string(),
1456                    "size=65536k".to_string(),
1457                ])
1458                .build()
1459                .map_err(|e| {
1460                    AgentError::InvalidSpec(format!("failed to build /dev/shm mount: {e}"))
1461                })?,
1462        );
1463
1464        // /dev/mqueue
1465        mounts.push(
1466            MountBuilder::default()
1467                .destination("/dev/mqueue".to_string())
1468                .typ("mqueue".to_string())
1469                .source("mqueue".to_string())
1470                .options(vec![
1471                    "nosuid".to_string(),
1472                    "noexec".to_string(),
1473                    "nodev".to_string(),
1474                ])
1475                .build()
1476                .map_err(|e| {
1477                    AgentError::InvalidSpec(format!("failed to build /dev/mqueue mount: {e}"))
1478                })?,
1479        );
1480
1481        // /sys - read-only unless privileged
1482        let sys_options = if spec.privileged {
1483            vec![
1484                "nosuid".to_string(),
1485                "noexec".to_string(),
1486                "nodev".to_string(),
1487            ]
1488        } else {
1489            vec![
1490                "nosuid".to_string(),
1491                "noexec".to_string(),
1492                "nodev".to_string(),
1493                "ro".to_string(),
1494            ]
1495        };
1496
1497        mounts.push(
1498            MountBuilder::default()
1499                .destination("/sys".to_string())
1500                .typ("sysfs".to_string())
1501                .source("sysfs".to_string())
1502                .options(sys_options)
1503                .build()
1504                .map_err(|e| AgentError::InvalidSpec(format!("failed to build /sys mount: {e}")))?,
1505        );
1506
1507        // /sys/fs/cgroup - for cgroup access
1508        mounts.push(
1509            MountBuilder::default()
1510                .destination("/sys/fs/cgroup".to_string())
1511                .typ("cgroup2".to_string())
1512                .source("cgroup".to_string())
1513                .options(vec![
1514                    "nosuid".to_string(),
1515                    "noexec".to_string(),
1516                    "nodev".to_string(),
1517                    "relatime".to_string(),
1518                ])
1519                .build()
1520                .map_err(|e| {
1521                    AgentError::InvalidSpec(format!("failed to build cgroup mount: {e}"))
1522                })?,
1523        );
1524
1525        Ok(mounts)
1526    }
1527
1528    /// Build storage mounts from `ServiceSpec` storage entries
1529    ///
1530    /// Converts `StorageSpec` entries to OCI Mount entries.
1531    /// Note: Named and Anonymous volumes require `StorageManager` to prepare paths.
1532    /// S3 volumes require s3fs FUSE mount (handled separately).
1533    #[allow(clippy::unused_self, clippy::too_many_lines)]
1534    fn build_storage_mounts(
1535        &self,
1536        spec: &ServiceSpec,
1537        volume_paths: &std::collections::HashMap<String, PathBuf>,
1538    ) -> Result<Vec<Mount>> {
1539        let mut mounts = Vec::new();
1540
1541        for storage in &spec.storage {
1542            let mount = match storage {
1543                StorageSpec::Bind {
1544                    source,
1545                    target,
1546                    readonly,
1547                } => {
1548                    let mut options = vec!["rbind".to_string()];
1549                    if *readonly {
1550                        options.push("ro".to_string());
1551                    } else {
1552                        options.push("rw".to_string());
1553                    }
1554
1555                    MountBuilder::default()
1556                        .destination(target.clone())
1557                        .typ("none".to_string())
1558                        .source(source.clone())
1559                        .options(options)
1560                        .build()
1561                        .map_err(|e| {
1562                            AgentError::InvalidSpec(format!(
1563                                "failed to build bind mount for {target}: {e}"
1564                            ))
1565                        })?
1566                }
1567
1568                StorageSpec::Named {
1569                    name,
1570                    target,
1571                    readonly,
1572                    tier,
1573                    ..
1574                } => {
1575                    // Get the prepared volume path from StorageManager
1576                    let source = volume_paths.get(name).ok_or_else(|| {
1577                        AgentError::InvalidSpec(format!(
1578                            "volume '{name}' not prepared - ensure StorageManager.ensure_volume() was called"
1579                        ))
1580                    })?;
1581
1582                    // Warn about SQLite safety for non-local tiers
1583                    if matches!(tier, StorageTier::Network) {
1584                        tracing::warn!(
1585                            volume = %name,
1586                            tier = ?tier,
1587                            "Network storage tier is NOT SQLite-safe. Avoid using SQLite databases on this volume."
1588                        );
1589                    }
1590
1591                    let mut options = vec!["rbind".to_string()];
1592                    if *readonly {
1593                        options.push("ro".to_string());
1594                    } else {
1595                        options.push("rw".to_string());
1596                    }
1597
1598                    MountBuilder::default()
1599                        .destination(target.clone())
1600                        .typ("none".to_string())
1601                        .source(source.to_string_lossy().to_string())
1602                        .options(options)
1603                        .build()
1604                        .map_err(|e| {
1605                            AgentError::InvalidSpec(format!(
1606                                "failed to build named volume mount for {target}: {e}"
1607                            ))
1608                        })?
1609                }
1610
1611                StorageSpec::Anonymous { target, tier } => {
1612                    // Anonymous volumes should have been created by StorageManager
1613                    // and the path passed in volume_paths with key "_anon_{target}"
1614                    let key = format!("_anon_{}", target.trim_start_matches('/').replace('/', "_"));
1615                    let source = volume_paths.get(&key).ok_or_else(|| {
1616                        AgentError::InvalidSpec(format!(
1617                            "anonymous volume for '{target}' not prepared"
1618                        ))
1619                    })?;
1620
1621                    if matches!(tier, StorageTier::Network) {
1622                        tracing::warn!(
1623                            target = %target,
1624                            tier = ?tier,
1625                            "Network storage tier is NOT SQLite-safe."
1626                        );
1627                    }
1628
1629                    let options = vec!["rbind".to_string(), "rw".to_string()];
1630
1631                    MountBuilder::default()
1632                        .destination(target.clone())
1633                        .typ("none".to_string())
1634                        .source(source.to_string_lossy().to_string())
1635                        .options(options)
1636                        .build()
1637                        .map_err(|e| {
1638                            AgentError::InvalidSpec(format!(
1639                                "failed to build anonymous volume mount for {target}: {e}"
1640                            ))
1641                        })?
1642                }
1643
1644                StorageSpec::Tmpfs { target, size, mode } => {
1645                    let mut options = vec!["nosuid".to_string(), "nodev".to_string()];
1646
1647                    if let Some(size_str) = size {
1648                        options.push(format!("size={size_str}"));
1649                    }
1650
1651                    if let Some(mode_val) = mode {
1652                        options.push(format!("mode={mode_val:o}"));
1653                    }
1654
1655                    MountBuilder::default()
1656                        .destination(target.clone())
1657                        .typ("tmpfs".to_string())
1658                        .source("tmpfs".to_string())
1659                        .options(options)
1660                        .build()
1661                        .map_err(|e| {
1662                            AgentError::InvalidSpec(format!(
1663                                "failed to build tmpfs mount for {target}: {e}"
1664                            ))
1665                        })?
1666                }
1667
1668                StorageSpec::S3 {
1669                    bucket,
1670                    prefix,
1671                    target,
1672                    readonly,
1673                    endpoint: _,
1674                    credentials: _,
1675                } => {
1676                    // S3 mounts are handled via s3fs FUSE
1677                    // The StorageManager should have mounted the bucket and passed the path
1678                    let key = format!("_s3_{}_{}", bucket, prefix.as_deref().unwrap_or(""));
1679                    let source = volume_paths.get(&key).ok_or_else(|| {
1680                        AgentError::InvalidSpec(format!(
1681                            "S3 volume for bucket '{bucket}' not mounted - ensure StorageManager.mount_s3() was called"
1682                        ))
1683                    })?;
1684
1685                    tracing::warn!(
1686                        bucket = %bucket,
1687                        target = %target,
1688                        "S3 storage is NOT SQLite-safe. Use for read-heavy workloads only."
1689                    );
1690
1691                    let mut options = vec!["rbind".to_string()];
1692                    if *readonly {
1693                        options.push("ro".to_string());
1694                    } else {
1695                        options.push("rw".to_string());
1696                    }
1697
1698                    MountBuilder::default()
1699                        .destination(target.clone())
1700                        .typ("none".to_string())
1701                        .source(source.to_string_lossy().to_string())
1702                        .options(options)
1703                        .build()
1704                        .map_err(|e| {
1705                            AgentError::InvalidSpec(format!(
1706                                "failed to build S3 mount for {target}: {e}"
1707                            ))
1708                        })?
1709                }
1710            };
1711
1712            mounts.push(mount);
1713        }
1714
1715        Ok(mounts)
1716    }
1717
1718    /// Build Linux-specific configuration
1719    #[allow(clippy::similar_names)] // euid/egid are POSIX-standard paired names
1720    #[allow(clippy::too_many_lines)]
1721    fn build_linux_config(
1722        &self,
1723        container_id: &ContainerId,
1724        spec: &ServiceSpec,
1725        cdi_edits: Option<&[CdiContainerEdits]>,
1726    ) -> Result<oci_spec::runtime::Linux> {
1727        // Build namespaces
1728        let mut namespaces = vec![
1729            LinuxNamespaceBuilder::default()
1730                .typ(LinuxNamespaceType::Pid)
1731                .build()
1732                .unwrap(),
1733            LinuxNamespaceBuilder::default()
1734                .typ(LinuxNamespaceType::Ipc)
1735                .build()
1736                .unwrap(),
1737            LinuxNamespaceBuilder::default()
1738                .typ(LinuxNamespaceType::Uts)
1739                .build()
1740                .unwrap(),
1741            LinuxNamespaceBuilder::default()
1742                .typ(LinuxNamespaceType::Mount)
1743                .build()
1744                .unwrap(),
1745        ];
1746
1747        // Only add Network namespace when NOT using host networking.
1748        // In host networking mode, the container shares the host's network stack
1749        // (like Docker's --network host).
1750        if !self.host_network {
1751            namespaces.push(
1752                LinuxNamespaceBuilder::default()
1753                    .typ(LinuxNamespaceType::Network)
1754                    .build()
1755                    .unwrap(),
1756            );
1757        }
1758
1759        // `nix::unistd` is unix-only. On non-unix targets (Windows), libcontainer
1760        // is not the runtime path (HCS is) and this function is effectively dead
1761        // code — so we statically force `rootless = false` there and skip the
1762        // user-namespace mapping block entirely.
1763        #[cfg(unix)]
1764        let rootless = !nix::unistd::geteuid().is_root();
1765        #[cfg(not(unix))]
1766        let rootless = false;
1767
1768        if rootless {
1769            namespaces.push(
1770                LinuxNamespaceBuilder::default()
1771                    .typ(LinuxNamespaceType::User)
1772                    .build()
1773                    .unwrap(),
1774            );
1775            namespaces.push(
1776                LinuxNamespaceBuilder::default()
1777                    .typ(LinuxNamespaceType::Cgroup)
1778                    .build()
1779                    .unwrap(),
1780            );
1781        }
1782
1783        let mut linux_builder = LinuxBuilder::default().namespaces(namespaces);
1784
1785        #[cfg(unix)]
1786        if rootless {
1787            let euid = nix::unistd::geteuid();
1788            let egid = nix::unistd::getegid();
1789            let username = nix::unistd::User::from_uid(euid)
1790                .ok()
1791                .flatten()
1792                .map(|u| u.name)
1793                .unwrap_or_default();
1794            linux_builder = linux_builder
1795                .uid_mappings(build_rootless_id_mappings(
1796                    euid.as_raw(),
1797                    "/etc/subuid",
1798                    &username,
1799                ))
1800                .gid_mappings(build_rootless_id_mappings(
1801                    egid.as_raw(),
1802                    "/etc/subgid",
1803                    &username,
1804                ));
1805        }
1806
1807        // Build resources (CPU, memory, devices)
1808        let resources = self.build_resources(spec)?;
1809        if let Some(resources) = resources {
1810            linux_builder = linux_builder.resources(resources);
1811        }
1812
1813        // Build device entries for passthrough.
1814        //
1815        // When CDI edits are present, the vendor-supplied device-node list
1816        // replaces our baked-in vendor-specific defaults — CDI knows the
1817        // host's exact device geometry (which majors/minors map to which
1818        // GPUs) so we trust it over our static `/dev/nvidiaN` enumeration.
1819        let mut devices = self.build_devices(spec, None, cdi_edits.is_some())?;
1820        if let Some(edits_per_device) = cdi_edits {
1821            for edits in edits_per_device {
1822                for node in &edits.device_nodes {
1823                    devices.push(cdi_node_to_oci_device(node)?);
1824                }
1825            }
1826        }
1827        if !devices.is_empty() {
1828            linux_builder = linux_builder.devices(devices);
1829        }
1830
1831        // Set rootfs propagation (matches Docker default)
1832        linux_builder = linux_builder.rootfs_propagation("private".to_string());
1833
1834        // Set masked/readonly paths based on privileged mode
1835        if spec.privileged {
1836            // Privileged containers get no masked paths (full access)
1837            linux_builder = linux_builder.masked_paths(vec![]).readonly_paths(vec![]);
1838        } else {
1839            // Set masked paths for security (hide sensitive host info)
1840            let masked_paths = vec![
1841                "/proc/acpi".to_string(),
1842                "/proc/asound".to_string(),
1843                "/proc/kcore".to_string(),
1844                "/proc/keys".to_string(),
1845                "/proc/latency_stats".to_string(),
1846                "/proc/timer_list".to_string(),
1847                "/proc/timer_stats".to_string(),
1848                "/proc/sched_debug".to_string(),
1849                "/proc/scsi".to_string(),
1850                "/sys/firmware".to_string(),
1851            ];
1852
1853            // Set readonly paths for security
1854            let readonly_paths = vec![
1855                "/proc/bus".to_string(),
1856                "/proc/fs".to_string(),
1857                "/proc/irq".to_string(),
1858                "/proc/sys".to_string(),
1859                "/proc/sysrq-trigger".to_string(),
1860            ];
1861
1862            linux_builder = linux_builder
1863                .masked_paths(masked_paths)
1864                .readonly_paths(readonly_paths);
1865        }
1866
1867        // Determine cgroups_path so libcontainer creates the container cgroup
1868        // under the current process's cgroup rather than at the v2 root. This
1869        // is required when running inside another container (e.g. Forgejo CI
1870        // `container:` block) where `/sys/fs/cgroup/cgroup.subtree_control` is
1871        // read-only. Precedence:
1872        //   1. spec.cgroup_parent (per-service override)         — all platforms
1873        //   2. ZLAYER_CGROUP_PARENT env var (host-wide override) — all platforms
1874        //   3. /proc/self/cgroup (auto-detect when nested)       — Linux only
1875        //   4. unset (default — bare-metal happy path; also the WSL2-delegate
1876        //      case on non-Linux hosts, where libcontainer inside the WSL
1877        //      distro resolves the parent at `zlayer runtime create` time)
1878        let cid = container_id.to_string();
1879
1880        // Explicit overrides are honored on every platform: a user might pin a
1881        // cgroup_parent for a WSL-delegate-bound spec even when this process
1882        // is running on Windows.
1883        let explicit_parent: Option<(String, &'static str)> =
1884            if let Some(p) = spec.cgroup_parent.as_deref().filter(|s| !s.is_empty()) {
1885                Some((p.to_string(), "spec"))
1886            } else if let Some(p) = std::env::var("ZLAYER_CGROUP_PARENT")
1887                .ok()
1888                .filter(|s| !s.is_empty())
1889            {
1890                Some((p, "env"))
1891            } else {
1892                None
1893            };
1894
1895        // Auto-detect (and the "no writable parent" hard error below) are
1896        // Linux-only: they inspect /proc/self/cgroup and /sys/fs/cgroup, which
1897        // don't exist on Windows hosts. When the bundle is destined for the
1898        // WSL2 delegate, cgroup-parent resolution happens inside the distro
1899        // at `zlayer runtime create` time, not here on the host.
1900        #[cfg(target_os = "linux")]
1901        let auto_parent: Option<(String, &'static str)> =
1902            if let Some(p) = crate::capability::ensure_daemon_leaf_and_container_parent() {
1903                Some((p, "auto-init"))
1904            } else if let Some(p) = crate::capability::current_cgroup_v2_path() {
1905                // Fallback: migration failed (likely cgroup root is read-only); use the
1906                // raw scope path. Pre-fix behaviour — surfaces the original error.
1907                Some((p, "auto"))
1908            } else {
1909                None
1910            };
1911        #[cfg(not(target_os = "linux"))]
1912        let auto_parent: Option<(String, &'static str)> = None;
1913
1914        let (cgroup_parent_value, cgroup_parent_source): (Option<String>, &'static str) =
1915            explicit_parent
1916                .or(auto_parent)
1917                .map_or((None, "none"), |(p, s)| (Some(p), s));
1918
1919        // Diagnostic guard rail: capability survey says we're nested, but we
1920        // couldn't resolve a cgroup parent here. This combination should not
1921        // normally happen because both code paths consult the same
1922        // `current_cgroup_v2_path()` helper. Surface it so an operator can
1923        // investigate; do not fail container creation. Linux-only — the
1924        // capability survey is itself a no-op on non-Linux.
1925        #[cfg(target_os = "linux")]
1926        if cgroup_parent_value.is_none() && crate::capability::DaemonCapabilities::get().is_nested {
1927            tracing::warn!(
1928                container_id = %cid,
1929                "capability survey reports nested daemon but cgroup_parent could not be resolved — proceeding with v2 root"
1930            );
1931        }
1932
1933        if let Some(parent) = cgroup_parent_value {
1934            let parent = parent.trim_end_matches('/');
1935            let full = format!("{parent}/{cid}");
1936            match cgroup_parent_source {
1937                "spec" => tracing::info!(
1938                    container_id = %cid,
1939                    source = "spec",
1940                    path = %full,
1941                    "cgroup_parent selected"
1942                ),
1943                "env" => tracing::info!(
1944                    container_id = %cid,
1945                    source = "env",
1946                    path = %full,
1947                    "cgroup_parent selected"
1948                ),
1949                "auto" => tracing::info!(
1950                    container_id = %cid,
1951                    source = "auto",
1952                    path = %full,
1953                    "cgroup_parent selected (from /proc/self/cgroup)"
1954                ),
1955                "auto-init" => tracing::info!(
1956                    container_id = %cid,
1957                    source = "auto-init",
1958                    path = %full,
1959                    "cgroup_parent selected (migrated daemon to <scope>/init; containers go under <scope>/containers)"
1960                ),
1961                _ => unreachable!(),
1962            }
1963            linux_builder = linux_builder.cgroups_path(std::path::PathBuf::from(full));
1964        } else {
1965            // Auto-detect found nothing AND no explicit override. Behaviour
1966            // differs by platform:
1967            //   - Linux: this is a real error in nested-container envs where
1968            //     the cgroup root is read-only. Emit the hard error so an
1969            //     operator fixes the env.
1970            //   - Non-Linux (Windows host building a bundle for the WSL2
1971            //     delegate): expected path; cgroup setup happens inside the
1972            //     distro at runtime-create time.
1973            #[cfg(target_os = "linux")]
1974            {
1975                let caps = crate::capability::DaemonCapabilities::get();
1976                if !caps.can_write_cgroup_root {
1977                    return Err(AgentError::InvalidSpec(format!(
1978                        "cannot create container {cid}: no writable cgroup parent. \
1979                         /proc/self/cgroup reports the cgroup-v2 root, and \
1980                         /sys/fs/cgroup is read-only to this process. Fix one of: \
1981                         (a) run the daemon's outer container with --cgroupns=host \
1982                         so /proc/self/cgroup reports a real parent; \
1983                         (b) set ZLAYER_CGROUP_PARENT=/path/to/writable/cgroup; \
1984                         (c) grant the daemon write access to /sys/fs/cgroup."
1985                    )));
1986                }
1987                tracing::info!(
1988                    container_id = %cid,
1989                    "cgroup_parent unset — libcontainer will use v2 root (cgroup root is writable here)"
1990                );
1991            }
1992            #[cfg(not(target_os = "linux"))]
1993            tracing::debug!(
1994                container_id = %cid,
1995                "non-Linux host — cgroup_parent unset; libcontainer inside the WSL distro will resolve a parent from its cgroup-v2 root"
1996            );
1997        }
1998
1999        linux_builder
2000            .build()
2001            .map_err(|e| AgentError::InvalidSpec(format!("failed to build linux config: {e}")))
2002    }
2003
2004    /// Build resource limits (CPU, memory, device cgroups)
2005    #[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
2006    fn build_resources(
2007        &self,
2008        spec: &ServiceSpec,
2009    ) -> Result<Option<oci_spec::runtime::LinuxResources>> {
2010        let mut resources_builder = LinuxResourcesBuilder::default();
2011        let mut has_resources = false;
2012
2013        // CPU limits
2014        if let Some(cpu_limit) = spec.resources.cpu {
2015            // Convert CPU cores to microseconds quota
2016            // 100000 microseconds = 1 core's worth of time per period
2017            let quota = (cpu_limit * 100_000.0) as i64;
2018            let cpu = LinuxCpuBuilder::default()
2019                .quota(quota)
2020                .period(100_000u64)
2021                .build()
2022                .map_err(|e| AgentError::InvalidSpec(format!("failed to build CPU limits: {e}")))?;
2023
2024            resources_builder = resources_builder.cpu(cpu);
2025            has_resources = true;
2026        }
2027
2028        // Memory limits
2029        if let Some(ref memory_str) = spec.resources.memory {
2030            let bytes = parse_memory_string(memory_str)
2031                .map_err(|e| AgentError::InvalidSpec(format!("invalid memory limit: {e}")))?;
2032
2033            let memory = LinuxMemoryBuilder::default()
2034                .limit(bytes as i64)
2035                .build()
2036                .map_err(|e| {
2037                    AgentError::InvalidSpec(format!("failed to build memory limits: {e}"))
2038                })?;
2039
2040            resources_builder = resources_builder.memory(memory);
2041            has_resources = true;
2042        }
2043
2044        // Device cgroup rules
2045        let device_rules = self.build_device_cgroup_rules(spec, None)?;
2046        if !device_rules.is_empty() {
2047            resources_builder = resources_builder.devices(device_rules);
2048            has_resources = true;
2049        }
2050
2051        if has_resources {
2052            let resources = resources_builder
2053                .build()
2054                .map_err(|e| AgentError::InvalidSpec(format!("failed to build resources: {e}")))?;
2055            Ok(Some(resources))
2056        } else {
2057            Ok(None)
2058        }
2059    }
2060
2061    /// Build device cgroup rules
2062    #[allow(clippy::unused_self, clippy::too_many_lines)]
2063    fn build_device_cgroup_rules(
2064        &self,
2065        spec: &ServiceSpec,
2066        _gpu_indices: Option<&[u32]>,
2067    ) -> Result<Vec<oci_spec::runtime::LinuxDeviceCgroup>> {
2068        let mut rules = Vec::new();
2069
2070        if spec.privileged {
2071            // Privileged mode: allow all devices
2072            let rule = LinuxDeviceCgroupBuilder::default()
2073                .allow(true)
2074                .access("rwm".to_string())
2075                .build()
2076                .map_err(|e| {
2077                    AgentError::InvalidSpec(format!("failed to build device cgroup rule: {e}"))
2078                })?;
2079            rules.push(rule);
2080        } else {
2081            // Default: deny all, then allow specific devices
2082            let deny_all = LinuxDeviceCgroupBuilder::default()
2083                .allow(false)
2084                .access("rwm".to_string())
2085                .build()
2086                .map_err(|e| AgentError::InvalidSpec(format!("failed to build deny rule: {e}")))?;
2087            rules.push(deny_all);
2088
2089            // Allow standard container devices
2090            // /dev/null, /dev/zero, /dev/full, /dev/random, /dev/urandom, /dev/tty
2091            let standard_char_devices = [
2092                (1, 3, "rwm"),    // /dev/null
2093                (1, 5, "rwm"),    // /dev/zero
2094                (1, 7, "rwm"),    // /dev/full
2095                (1, 8, "rwm"),    // /dev/random
2096                (1, 9, "rwm"),    // /dev/urandom
2097                (5, 0, "rwm"),    // /dev/tty
2098                (5, 1, "rwm"),    // /dev/console
2099                (5, 2, "rwm"),    // /dev/ptmx
2100                (136, -1, "rwm"), // /dev/pts/* (wildcard minor)
2101            ];
2102
2103            for (major, minor, access) in standard_char_devices {
2104                let mut builder = LinuxDeviceCgroupBuilder::default()
2105                    .allow(true)
2106                    .typ(LinuxDeviceType::C)
2107                    .major(i64::from(major))
2108                    .access(access.to_string());
2109
2110                if minor >= 0 {
2111                    builder = builder.minor(i64::from(minor));
2112                }
2113
2114                let rule = builder.build().map_err(|e| {
2115                    AgentError::InvalidSpec(format!("failed to build char device rule: {e}"))
2116                })?;
2117                rules.push(rule);
2118            }
2119
2120            // Allow specific devices from spec (Unix-only: requires /dev/* fs
2121            // probing via `MetadataExt::rdev`). On Windows the WSL2 delegate
2122            // path regenerates these inside the Linux distro, so we skip here.
2123            #[cfg(unix)]
2124            for device in &spec.devices {
2125                if let Ok((major, minor)) = get_device_major_minor(&device.path) {
2126                    let dev_type = get_device_type(&device.path).unwrap_or(LinuxDeviceType::C);
2127
2128                    // Build access string
2129                    let mut access = String::new();
2130                    if device.read {
2131                        access.push('r');
2132                    }
2133                    if device.write {
2134                        access.push('w');
2135                    }
2136                    if device.mknod {
2137                        access.push('m');
2138                    }
2139                    if access.is_empty() {
2140                        access = "rw".to_string();
2141                    }
2142
2143                    let rule = LinuxDeviceCgroupBuilder::default()
2144                        .allow(true)
2145                        .typ(dev_type)
2146                        .major(major)
2147                        .minor(minor)
2148                        .access(access)
2149                        .build()
2150                        .map_err(|e| {
2151                            AgentError::InvalidSpec(format!(
2152                                "failed to build device rule for {}: {}",
2153                                device.path, e
2154                            ))
2155                        })?;
2156                    rules.push(rule);
2157                } else {
2158                    tracing::warn!("Failed to get device info for {}, skipping", device.path);
2159                }
2160            }
2161
2162            // Auto-allow GPU devices in cgroup when gpu spec is set
2163            if let Some(ref gpu) = spec.resources.gpu {
2164                match gpu.vendor.as_str() {
2165                    "nvidia" => {
2166                        // Allow all nvidia devices (major 195 for nvidia GPUs)
2167                        let rule = LinuxDeviceCgroupBuilder::default()
2168                            .allow(true)
2169                            .typ(LinuxDeviceType::C)
2170                            .major(195i64)
2171                            .access("rwm".to_string())
2172                            .build()
2173                            .map_err(|e| {
2174                                AgentError::InvalidSpec(format!(
2175                                    "failed to build GPU cgroup rule: {e}"
2176                                ))
2177                            })?;
2178                        rules.push(rule);
2179
2180                        // nvidia-uvm (major 510 or check dynamically)
2181                        let uvm_rule = LinuxDeviceCgroupBuilder::default()
2182                            .allow(true)
2183                            .typ(LinuxDeviceType::C)
2184                            .major(510i64)
2185                            .access("rwm".to_string())
2186                            .build()
2187                            .map_err(|e| {
2188                                AgentError::InvalidSpec(format!(
2189                                    "failed to build GPU UVM cgroup rule: {e}"
2190                                ))
2191                            })?;
2192                        rules.push(uvm_rule);
2193                    }
2194                    "amd" => {
2195                        // AMD ROCm: /dev/dri/renderD* and /dev/dri/card* (major 226)
2196                        let dri_rule = LinuxDeviceCgroupBuilder::default()
2197                            .allow(true)
2198                            .typ(LinuxDeviceType::C)
2199                            .major(226i64)
2200                            .access("rwm".to_string())
2201                            .build()
2202                            .map_err(|e| {
2203                                AgentError::InvalidSpec(format!(
2204                                    "failed to build AMD DRI cgroup rule: {e}"
2205                                ))
2206                            })?;
2207                        rules.push(dri_rule);
2208
2209                        // /dev/kfd - AMD Kernel Fusion Driver for compute (major 234)
2210                        let kfd_rule = LinuxDeviceCgroupBuilder::default()
2211                            .allow(true)
2212                            .typ(LinuxDeviceType::C)
2213                            .major(234i64)
2214                            .access("rwm".to_string())
2215                            .build()
2216                            .map_err(|e| {
2217                                AgentError::InvalidSpec(format!(
2218                                    "failed to build AMD KFD cgroup rule: {e}"
2219                                ))
2220                            })?;
2221                        rules.push(kfd_rule);
2222                    }
2223                    "intel" => {
2224                        // Intel GPU: /dev/dri/renderD* and /dev/dri/card* (major 226)
2225                        let dri_rule = LinuxDeviceCgroupBuilder::default()
2226                            .allow(true)
2227                            .typ(LinuxDeviceType::C)
2228                            .major(226i64)
2229                            .access("rwm".to_string())
2230                            .build()
2231                            .map_err(|e| {
2232                                AgentError::InvalidSpec(format!(
2233                                    "failed to build Intel DRI cgroup rule: {e}"
2234                                ))
2235                            })?;
2236                        rules.push(dri_rule);
2237                    }
2238                    other => {
2239                        // Unknown vendor - allow DRI devices as a reasonable default
2240                        tracing::warn!(
2241                            vendor = %other,
2242                            "Unknown GPU vendor, allowing DRI devices (major 226)"
2243                        );
2244                        let dri_rule = LinuxDeviceCgroupBuilder::default()
2245                            .allow(true)
2246                            .typ(LinuxDeviceType::C)
2247                            .major(226i64)
2248                            .access("rwm".to_string())
2249                            .build()
2250                            .map_err(|e| {
2251                                AgentError::InvalidSpec(format!(
2252                                    "failed to build GPU DRI cgroup rule: {e}"
2253                                ))
2254                            })?;
2255                        rules.push(dri_rule);
2256                    }
2257                }
2258            }
2259        }
2260
2261        Ok(rules)
2262    }
2263
2264    /// Build Linux device entries for passthrough
2265    ///
2266    /// # Platform
2267    /// Every branch below walks `/dev/*` on the host to resolve major/minor
2268    /// numbers via `MetadataExt::rdev`. On Windows (where this module is
2269    /// compiled only to feed the WSL2 delegate's cross-platform spec path) we
2270    /// skip device discovery and return an empty list — the Linux side of the
2271    /// delegate re-runs this step inside the WSL2 distro.
2272    #[allow(clippy::unused_self, clippy::too_many_lines)]
2273    #[cfg_attr(not(unix), allow(clippy::unnecessary_wraps, clippy::needless_return))]
2274    fn build_devices(
2275        &self,
2276        spec: &ServiceSpec,
2277        gpu_indices: Option<&[u32]>,
2278        skip_gpu_defaults: bool,
2279    ) -> Result<Vec<oci_spec::runtime::LinuxDevice>> {
2280        #[cfg(not(unix))]
2281        {
2282            let _ = (spec, gpu_indices, skip_gpu_defaults);
2283            return Ok(Vec::new());
2284        }
2285
2286        #[cfg(unix)]
2287        {
2288            let mut devices = Vec::new();
2289
2290            for device in &spec.devices {
2291                if let Ok((major, minor)) = get_device_major_minor(&device.path) {
2292                    let dev_type = get_device_type(&device.path).unwrap_or(LinuxDeviceType::C);
2293
2294                    let linux_device = LinuxDeviceBuilder::default()
2295                        .path(device.path.clone())
2296                        .typ(dev_type)
2297                        .major(major)
2298                        .minor(minor)
2299                        .file_mode(0o666u32)
2300                        .uid(0u32)
2301                        .gid(0u32)
2302                        .build()
2303                        .map_err(|e| {
2304                            AgentError::InvalidSpec(format!(
2305                                "failed to build device {}: {}",
2306                                device.path, e
2307                            ))
2308                        })?;
2309
2310                    devices.push(linux_device);
2311                }
2312            }
2313
2314            // When CDI is providing GPU device descriptors the caller will
2315            // append the vendor-supplied entries; skip our hard-coded
2316            // `/dev/nvidiaN` enumeration so we don't end up with both sources
2317            // of truth.
2318            if skip_gpu_defaults {
2319                return Ok(devices);
2320            }
2321
2322            // Auto-inject GPU devices when gpu spec is set
2323            if let Some(ref gpu) = spec.resources.gpu {
2324                let indices: Vec<u32> =
2325                    gpu_indices.map_or_else(|| (0..gpu.count).collect(), <[u32]>::to_vec);
2326
2327                match gpu.vendor.as_str() {
2328                    "nvidia" => {
2329                        // Always needed: nvidiactl, nvidia-uvm, nvidia-uvm-tools
2330                        let always_devices =
2331                            ["/dev/nvidiactl", "/dev/nvidia-uvm", "/dev/nvidia-uvm-tools"];
2332                        for dev_path in &always_devices {
2333                            if let Ok((major, minor)) = get_device_major_minor(dev_path) {
2334                                let dev_type =
2335                                    get_device_type(dev_path).unwrap_or(LinuxDeviceType::C);
2336                                let linux_device = LinuxDeviceBuilder::default()
2337                                    .path((*dev_path).to_string())
2338                                    .typ(dev_type)
2339                                    .major(major)
2340                                    .minor(minor)
2341                                    .file_mode(0o666u32)
2342                                    .uid(0u32)
2343                                    .gid(0u32)
2344                                    .build()
2345                                    .map_err(|e| {
2346                                        AgentError::InvalidSpec(format!(
2347                                            "failed to build GPU device {dev_path}: {e}"
2348                                        ))
2349                                    })?;
2350                                devices.push(linux_device);
2351                            } else {
2352                                tracing::warn!(
2353                                    "GPU device {} not found on host, skipping",
2354                                    dev_path
2355                                );
2356                            }
2357                        }
2358
2359                        // Per-GPU devices: /dev/nvidia0, /dev/nvidia1, etc.
2360                        for i in &indices {
2361                            let dev_path = format!("/dev/nvidia{i}");
2362                            if let Ok((major, minor)) = get_device_major_minor(&dev_path) {
2363                                let dev_type =
2364                                    get_device_type(&dev_path).unwrap_or(LinuxDeviceType::C);
2365                                let linux_device = LinuxDeviceBuilder::default()
2366                                    .path(dev_path.clone())
2367                                    .typ(dev_type)
2368                                    .major(major)
2369                                    .minor(minor)
2370                                    .file_mode(0o666u32)
2371                                    .uid(0u32)
2372                                    .gid(0u32)
2373                                    .build()
2374                                    .map_err(|e| {
2375                                        AgentError::InvalidSpec(format!(
2376                                            "failed to build GPU device {dev_path}: {e}"
2377                                        ))
2378                                    })?;
2379                                devices.push(linux_device);
2380                            } else {
2381                                tracing::warn!(
2382                                    "GPU device {} not found on host, skipping",
2383                                    dev_path
2384                                );
2385                            }
2386                        }
2387                    }
2388                    "amd" => {
2389                        // AMD ROCm: /dev/kfd is always required for compute
2390                        let amd_always_devices = ["/dev/kfd"];
2391                        for dev_path in &amd_always_devices {
2392                            if let Ok((major, minor)) = get_device_major_minor(dev_path) {
2393                                let dev_type =
2394                                    get_device_type(dev_path).unwrap_or(LinuxDeviceType::C);
2395                                let linux_device = LinuxDeviceBuilder::default()
2396                                    .path((*dev_path).to_string())
2397                                    .typ(dev_type)
2398                                    .major(major)
2399                                    .minor(minor)
2400                                    .file_mode(0o666u32)
2401                                    .uid(0u32)
2402                                    .gid(0u32)
2403                                    .build()
2404                                    .map_err(|e| {
2405                                        AgentError::InvalidSpec(format!(
2406                                            "failed to build GPU device {dev_path}: {e}"
2407                                        ))
2408                                    })?;
2409                                devices.push(linux_device);
2410                            } else {
2411                                tracing::warn!(
2412                                    "GPU device {} not found on host, skipping",
2413                                    dev_path
2414                                );
2415                            }
2416                        }
2417
2418                        // DRI render nodes: /dev/dri/renderD128, renderD129, etc.
2419                        for i in &indices {
2420                            let dev_path = format!("/dev/dri/renderD{}", 128 + i);
2421                            if let Ok((major, minor)) = get_device_major_minor(&dev_path) {
2422                                let dev_type =
2423                                    get_device_type(&dev_path).unwrap_or(LinuxDeviceType::C);
2424                                let linux_device = LinuxDeviceBuilder::default()
2425                                    .path(dev_path.clone())
2426                                    .typ(dev_type)
2427                                    .major(major)
2428                                    .minor(minor)
2429                                    .file_mode(0o666u32)
2430                                    .uid(0u32)
2431                                    .gid(0u32)
2432                                    .build()
2433                                    .map_err(|e| {
2434                                        AgentError::InvalidSpec(format!(
2435                                            "failed to build GPU device {dev_path}: {e}"
2436                                        ))
2437                                    })?;
2438                                devices.push(linux_device);
2439                            } else {
2440                                tracing::warn!(
2441                                    "GPU device {} not found on host, skipping",
2442                                    dev_path
2443                                );
2444                            }
2445                        }
2446
2447                        // DRI card nodes: /dev/dri/card0, card1, etc.
2448                        for i in &indices {
2449                            let dev_path = format!("/dev/dri/card{i}");
2450                            if let Ok((major, minor)) = get_device_major_minor(&dev_path) {
2451                                let dev_type =
2452                                    get_device_type(&dev_path).unwrap_or(LinuxDeviceType::C);
2453                                let linux_device = LinuxDeviceBuilder::default()
2454                                    .path(dev_path.clone())
2455                                    .typ(dev_type)
2456                                    .major(major)
2457                                    .minor(minor)
2458                                    .file_mode(0o666u32)
2459                                    .uid(0u32)
2460                                    .gid(0u32)
2461                                    .build()
2462                                    .map_err(|e| {
2463                                        AgentError::InvalidSpec(format!(
2464                                            "failed to build GPU device {dev_path}: {e}"
2465                                        ))
2466                                    })?;
2467                                devices.push(linux_device);
2468                            } else {
2469                                tracing::warn!(
2470                                    "GPU device {} not found on host, skipping",
2471                                    dev_path
2472                                );
2473                            }
2474                        }
2475                    }
2476                    "intel" => {
2477                        // Intel GPU: DRI render nodes /dev/dri/renderD128, etc.
2478                        for i in &indices {
2479                            let dev_path = format!("/dev/dri/renderD{}", 128 + i);
2480                            if let Ok((major, minor)) = get_device_major_minor(&dev_path) {
2481                                let dev_type =
2482                                    get_device_type(&dev_path).unwrap_or(LinuxDeviceType::C);
2483                                let linux_device = LinuxDeviceBuilder::default()
2484                                    .path(dev_path.clone())
2485                                    .typ(dev_type)
2486                                    .major(major)
2487                                    .minor(minor)
2488                                    .file_mode(0o666u32)
2489                                    .uid(0u32)
2490                                    .gid(0u32)
2491                                    .build()
2492                                    .map_err(|e| {
2493                                        AgentError::InvalidSpec(format!(
2494                                            "failed to build GPU device {dev_path}: {e}"
2495                                        ))
2496                                    })?;
2497                                devices.push(linux_device);
2498                            } else {
2499                                tracing::warn!(
2500                                    "GPU device {} not found on host, skipping",
2501                                    dev_path
2502                                );
2503                            }
2504                        }
2505
2506                        // Intel DRI card nodes: /dev/dri/card0, card1, etc.
2507                        for i in &indices {
2508                            let dev_path = format!("/dev/dri/card{i}");
2509                            if let Ok((major, minor)) = get_device_major_minor(&dev_path) {
2510                                let dev_type =
2511                                    get_device_type(&dev_path).unwrap_or(LinuxDeviceType::C);
2512                                let linux_device = LinuxDeviceBuilder::default()
2513                                    .path(dev_path.clone())
2514                                    .typ(dev_type)
2515                                    .major(major)
2516                                    .minor(minor)
2517                                    .file_mode(0o666u32)
2518                                    .uid(0u32)
2519                                    .gid(0u32)
2520                                    .build()
2521                                    .map_err(|e| {
2522                                        AgentError::InvalidSpec(format!(
2523                                            "failed to build GPU device {dev_path}: {e}"
2524                                        ))
2525                                    })?;
2526                                devices.push(linux_device);
2527                            } else {
2528                                tracing::warn!(
2529                                    "GPU device {} not found on host, skipping",
2530                                    dev_path
2531                                );
2532                            }
2533                        }
2534                    }
2535                    other => {
2536                        // Unknown vendor - try DRI render nodes as default
2537                        tracing::warn!(
2538                            vendor = %other,
2539                            "Unknown GPU vendor, attempting DRI device passthrough"
2540                        );
2541                        for i in &indices {
2542                            let dev_path = format!("/dev/dri/renderD{}", 128 + i);
2543                            if let Ok((major, minor)) = get_device_major_minor(&dev_path) {
2544                                let dev_type =
2545                                    get_device_type(&dev_path).unwrap_or(LinuxDeviceType::C);
2546                                let linux_device = LinuxDeviceBuilder::default()
2547                                    .path(dev_path.clone())
2548                                    .typ(dev_type)
2549                                    .major(major)
2550                                    .minor(minor)
2551                                    .file_mode(0o666u32)
2552                                    .uid(0u32)
2553                                    .gid(0u32)
2554                                    .build()
2555                                    .map_err(|e| {
2556                                        AgentError::InvalidSpec(format!(
2557                                            "failed to build GPU device {dev_path}: {e}"
2558                                        ))
2559                                    })?;
2560                                devices.push(linux_device);
2561                            } else {
2562                                tracing::warn!(
2563                                    "GPU device {} not found on host, skipping",
2564                                    dev_path
2565                                );
2566                            }
2567                        }
2568                    }
2569                }
2570            }
2571
2572            Ok(devices)
2573        } // end #[cfg(unix)]
2574    }
2575
2576    /// Generate the OCI spec and write config.json to the bundle directory
2577    ///
2578    /// Unlike `build()`, this does NOT create the bundle directory or set up rootfs.
2579    /// Use this when the bundle directory and rootfs already exist (e.g., rootfs was
2580    /// extracted directly by `LayerUnpacker`).
2581    ///
2582    /// # Errors
2583    /// Returns an error if the OCI spec cannot be built or config.json cannot be written.
2584    ///
2585    /// # Returns
2586    /// The path to the bundle directory on success
2587    pub async fn write_config(
2588        &self,
2589        container_id: &ContainerId,
2590        spec: &ServiceSpec,
2591    ) -> Result<PathBuf> {
2592        // Generate OCI runtime spec
2593        let oci_spec = self
2594            .build_spec_only(container_id, spec, &self.volume_paths)
2595            .await?;
2596
2597        // Write config.json
2598        let config_path = self.bundle_dir.join("config.json");
2599        let config_json =
2600            serde_json::to_string_pretty(&oci_spec).map_err(|e| AgentError::CreateFailed {
2601                id: container_id.to_string(),
2602                reason: format!("failed to serialize OCI spec: {e}"),
2603            })?;
2604
2605        fs::write(&config_path, config_json)
2606            .await
2607            .map_err(|e| AgentError::CreateFailed {
2608                id: container_id.to_string(),
2609                reason: format!("failed to write config.json: {e}"),
2610            })?;
2611
2612        tracing::debug!(
2613            "Wrote OCI config.json at {} for container {}",
2614            config_path.display(),
2615            container_id
2616        );
2617
2618        Ok(self.bundle_dir.clone())
2619    }
2620
2621    /// Resolve command from `ServiceSpec` and optional image config following Docker/OCI semantics
2622    ///
2623    /// Resolution order:
2624    /// 1. spec entrypoint + args -> use those
2625    /// 2. spec entrypoint only -> use entrypoint
2626    /// 3. spec args only -> use args
2627    /// 4. `image_config` entrypoint/cmd -> use `image_config.full_command()`
2628    /// 5. fallback to /bin/sh
2629    fn resolve_command_from_spec(
2630        spec: &ServiceSpec,
2631        image_config: Option<&zlayer_registry::ImageConfig>,
2632    ) -> Vec<String> {
2633        let mut args = Vec::new();
2634
2635        match (&spec.command.entrypoint, &spec.command.args) {
2636            (Some(entrypoint), Some(cmd_args)) => {
2637                args.extend_from_slice(entrypoint);
2638                args.extend_from_slice(cmd_args);
2639            }
2640            (Some(entrypoint), None) => {
2641                args.extend_from_slice(entrypoint);
2642            }
2643            (None, Some(cmd_args)) if !cmd_args.is_empty() => {
2644                args.extend_from_slice(cmd_args);
2645            }
2646            _ => {
2647                // No spec command - try image config
2648                if let Some(img_cmd) =
2649                    image_config.and_then(zlayer_registry::ImageConfig::full_command)
2650                {
2651                    if img_cmd.is_empty() {
2652                        args.push("/bin/sh".to_string());
2653                    } else {
2654                        args.extend(img_cmd);
2655                    }
2656                } else {
2657                    args.push("/bin/sh".to_string());
2658                }
2659            }
2660        }
2661
2662        args
2663    }
2664
2665    /// Clean up a bundle directory
2666    ///
2667    /// Removes the bundle directory and all its contents.
2668    ///
2669    /// # Errors
2670    /// Returns an error if the bundle directory cannot be removed.
2671    pub async fn cleanup(&self) -> Result<()> {
2672        if self.bundle_dir.exists() {
2673            fs::remove_dir_all(&self.bundle_dir)
2674                .await
2675                .map_err(|e| AgentError::CreateFailed {
2676                    id: "cleanup".to_string(),
2677                    reason: format!(
2678                        "failed to remove bundle directory {}: {}",
2679                        self.bundle_dir.display(),
2680                        e
2681                    ),
2682                })?;
2683        }
2684        Ok(())
2685    }
2686}
2687
2688/// Create a bundle for a container
2689///
2690/// Convenience function that creates a bundle in the default location.
2691///
2692/// # Errors
2693/// Returns an error if bundle creation fails.
2694///
2695/// # Platform
2696/// Unix-only — wraps [`BundleBuilder::build`], which uses
2697/// `tokio::fs::symlink` (not available on Windows). Windows callers should
2698/// use [`BundleBuilder::build_spec_only`] directly and pipe the result into
2699/// a WSL2 delegate.
2700#[cfg(unix)]
2701pub async fn create_bundle(
2702    container_id: &ContainerId,
2703    spec: &ServiceSpec,
2704    rootfs_path: Option<PathBuf>,
2705) -> Result<PathBuf> {
2706    let mut builder =
2707        BundleBuilder::for_container(container_id).with_host_network(spec.host_network);
2708
2709    if let Some(rootfs) = rootfs_path {
2710        builder = builder.with_rootfs(rootfs);
2711    }
2712
2713    builder.build(container_id, spec).await
2714}
2715
2716/// Clean up a container's bundle
2717///
2718/// Convenience function to remove a bundle from the default location.
2719///
2720/// # Errors
2721/// Returns an error if cleanup fails.
2722pub async fn cleanup_bundle(container_id: &ContainerId) -> Result<()> {
2723    let builder = BundleBuilder::for_container(container_id);
2724    builder.cleanup().await
2725}
2726
2727#[cfg(test)]
2728mod tests {
2729    use super::*;
2730    use zlayer_spec::*;
2731
2732    fn mock_spec() -> ServiceSpec {
2733        serde_yaml::from_str::<DeploymentSpec>(
2734            r"
2735version: v1
2736deployment: test
2737services:
2738  test:
2739    rtype: service
2740    image:
2741      name: test:latest
2742    endpoints:
2743      - name: http
2744        protocol: http
2745        port: 8080
2746",
2747        )
2748        .unwrap()
2749        .services
2750        .remove("test")
2751        .unwrap()
2752    }
2753
2754    #[cfg(target_os = "linux")]
2755    fn mock_spec_with_resources() -> ServiceSpec {
2756        serde_yaml::from_str::<DeploymentSpec>(
2757            r"
2758version: v1
2759deployment: test
2760services:
2761  test:
2762    rtype: service
2763    image:
2764      name: test:latest
2765    resources:
2766      cpu: 0.5
2767      memory: 512Mi
2768    env:
2769      MY_VAR: my_value
2770      ANOTHER: value2
2771    endpoints:
2772      - name: http
2773        protocol: http
2774        port: 8080
2775",
2776        )
2777        .unwrap()
2778        .services
2779        .remove("test")
2780        .unwrap()
2781    }
2782
2783    #[cfg(target_os = "linux")]
2784    fn mock_privileged_spec() -> ServiceSpec {
2785        serde_yaml::from_str::<DeploymentSpec>(
2786            r"
2787version: v1
2788deployment: test
2789services:
2790  test:
2791    rtype: service
2792    image:
2793      name: test:latest
2794    privileged: true
2795    endpoints:
2796      - name: http
2797        protocol: http
2798        port: 8080
2799",
2800        )
2801        .unwrap()
2802        .services
2803        .remove("test")
2804        .unwrap()
2805    }
2806
2807    #[test]
2808    fn test_parse_memory_string() {
2809        assert_eq!(parse_memory_string("512Mi").unwrap(), 512 * 1024 * 1024);
2810        assert_eq!(parse_memory_string("1Gi").unwrap(), 1024 * 1024 * 1024);
2811        assert_eq!(parse_memory_string("2G").unwrap(), 2 * 1000 * 1000 * 1000);
2812        assert_eq!(parse_memory_string("1024").unwrap(), 1024);
2813        assert_eq!(parse_memory_string("512Ki").unwrap(), 512 * 1024);
2814    }
2815
2816    #[test]
2817    fn test_parse_memory_string_errors() {
2818        assert!(parse_memory_string("").is_err());
2819        assert!(parse_memory_string("abc").is_err());
2820        assert!(parse_memory_string("12.5Mi").is_err());
2821    }
2822
2823    #[test]
2824    fn test_bundle_builder_new() {
2825        let builder = BundleBuilder::new("/tmp/test-bundle".into());
2826        assert_eq!(builder.bundle_dir(), Path::new("/tmp/test-bundle"));
2827        assert!(builder.rootfs_path.is_none());
2828    }
2829
2830    #[test]
2831    fn test_bundle_builder_for_container() {
2832        let dirs = zlayer_paths::ZLayerDirs::system_default();
2833        let id = ContainerId::new("myservice".to_string(), 1);
2834        let builder = BundleBuilder::for_container(&id);
2835        assert_eq!(builder.bundle_dir(), dirs.bundles().join("myservice-rep-1"));
2836    }
2837
2838    #[test]
2839    fn test_bundle_builder_with_rootfs() {
2840        let dirs = zlayer_paths::ZLayerDirs::system_default();
2841        let builder = BundleBuilder::new("/tmp/test-bundle".into())
2842            .with_rootfs(dirs.rootfs().join("myimage"));
2843        assert_eq!(builder.rootfs_path, Some(dirs.rootfs().join("myimage")));
2844    }
2845
2846    #[cfg(target_os = "linux")]
2847    #[tokio::test]
2848    async fn test_build_oci_spec_basic() {
2849        let id = ContainerId::new("test".to_string(), 1);
2850        let spec = mock_spec();
2851        let builder = BundleBuilder::new("/tmp/test-bundle".into());
2852
2853        let oci_spec = builder
2854            .build_spec_only(&id, &spec, &std::collections::HashMap::new())
2855            .await
2856            .unwrap();
2857
2858        assert_eq!(oci_spec.version(), "1.0.2");
2859        assert!(oci_spec.root().is_some());
2860        assert_eq!(
2861            oci_spec.root().as_ref().unwrap().path(),
2862            std::path::Path::new("rootfs")
2863        );
2864        assert!(oci_spec.process().is_some());
2865        assert!(oci_spec.linux().is_some());
2866    }
2867
2868    #[cfg(target_os = "linux")]
2869    #[tokio::test]
2870    async fn test_build_oci_spec_with_resources() {
2871        let id = ContainerId::new("test".to_string(), 1);
2872        let spec = mock_spec_with_resources();
2873        let builder = BundleBuilder::new("/tmp/test-bundle".into());
2874
2875        let oci_spec = builder
2876            .build_spec_only(&id, &spec, &std::collections::HashMap::new())
2877            .await
2878            .unwrap();
2879
2880        // Check that resources are set
2881        let linux = oci_spec.linux().as_ref().unwrap();
2882        let resources = linux.resources().as_ref().unwrap();
2883
2884        // Check CPU
2885        let cpu = resources.cpu().as_ref().unwrap();
2886        assert_eq!(cpu.quota(), Some(50_000)); // 0.5 cores * 100000
2887        assert_eq!(cpu.period(), Some(100_000));
2888
2889        // Check memory
2890        let memory = resources.memory().as_ref().unwrap();
2891        assert_eq!(memory.limit(), Some(512 * 1024 * 1024)); // 512Mi
2892    }
2893
2894    #[cfg(target_os = "linux")]
2895    #[tokio::test]
2896    async fn test_build_oci_spec_privileged() {
2897        let id = ContainerId::new("test".to_string(), 1);
2898        let spec = mock_privileged_spec();
2899        let builder = BundleBuilder::new("/tmp/test-bundle".into());
2900
2901        let oci_spec = builder
2902            .build_spec_only(&id, &spec, &std::collections::HashMap::new())
2903            .await
2904            .unwrap();
2905
2906        // Check that all capabilities are set
2907        let process = oci_spec.process().as_ref().unwrap();
2908        let caps = process.capabilities().as_ref().unwrap();
2909        let bounding = caps.bounding().as_ref().unwrap();
2910
2911        // Should have all capabilities
2912        assert!(bounding.contains(&Capability::SysAdmin));
2913        assert!(bounding.contains(&Capability::NetAdmin));
2914
2915        // Check that masked paths are NOT set for privileged
2916        let linux = oci_spec.linux().as_ref().unwrap();
2917        assert!(
2918            linux.masked_paths().is_none() || linux.masked_paths().as_ref().unwrap().is_empty()
2919        );
2920    }
2921
2922    #[cfg(target_os = "linux")]
2923    #[tokio::test]
2924    async fn test_build_oci_spec_environment() {
2925        let id = ContainerId::new("test".to_string(), 1);
2926        let spec = mock_spec_with_resources();
2927        let builder = BundleBuilder::new("/tmp/test-bundle".into())
2928            .with_env("EXTRA_VAR".to_string(), "extra_value".to_string());
2929
2930        let oci_spec = builder
2931            .build_spec_only(&id, &spec, &std::collections::HashMap::new())
2932            .await
2933            .unwrap();
2934
2935        let process = oci_spec.process().as_ref().unwrap();
2936        let env = process.env().as_ref().unwrap();
2937
2938        // Check service env vars are present
2939        assert!(env.iter().any(|e| e == "MY_VAR=my_value"));
2940        assert!(env.iter().any(|e| e == "ANOTHER=value2"));
2941        // Check extra env var is present
2942        assert!(env.iter().any(|e| e == "EXTRA_VAR=extra_value"));
2943        // Check PATH is present
2944        assert!(env.iter().any(|e| e.starts_with("PATH=")));
2945    }
2946
2947    #[cfg(target_os = "linux")]
2948    #[tokio::test]
2949    async fn test_build_namespaces() {
2950        let id = ContainerId::new("test".to_string(), 1);
2951        let spec = mock_spec();
2952        let builder = BundleBuilder::new("/tmp/test-bundle".into());
2953
2954        let oci_spec = builder
2955            .build_spec_only(&id, &spec, &std::collections::HashMap::new())
2956            .await
2957            .unwrap();
2958        let linux = oci_spec.linux().as_ref().unwrap();
2959        let namespaces = linux.namespaces().as_ref().unwrap();
2960
2961        // Check we have the expected namespaces
2962        let namespace_types: Vec<_> = namespaces
2963            .iter()
2964            .map(oci_spec::runtime::LinuxNamespace::typ)
2965            .collect();
2966        assert!(namespace_types.contains(&LinuxNamespaceType::Pid));
2967        assert!(namespace_types.contains(&LinuxNamespaceType::Ipc));
2968        assert!(namespace_types.contains(&LinuxNamespaceType::Uts));
2969        assert!(namespace_types.contains(&LinuxNamespaceType::Mount));
2970        assert!(namespace_types.contains(&LinuxNamespaceType::Network));
2971    }
2972
2973    #[cfg(target_os = "linux")]
2974    #[tokio::test]
2975    async fn test_build_namespaces_host_network() {
2976        let id = ContainerId::new("test".to_string(), 1);
2977        let spec = mock_spec();
2978        let builder = BundleBuilder::new("/tmp/test-bundle".into()).with_host_network(true);
2979
2980        let oci_spec = builder
2981            .build_spec_only(&id, &spec, &std::collections::HashMap::new())
2982            .await
2983            .unwrap();
2984        let linux = oci_spec.linux().as_ref().unwrap();
2985        let namespaces = linux.namespaces().as_ref().unwrap();
2986
2987        // Check we have the expected namespaces (NO Network namespace)
2988        let namespace_types: Vec<_> = namespaces
2989            .iter()
2990            .map(oci_spec::runtime::LinuxNamespace::typ)
2991            .collect();
2992        assert!(namespace_types.contains(&LinuxNamespaceType::Pid));
2993        assert!(namespace_types.contains(&LinuxNamespaceType::Ipc));
2994        assert!(namespace_types.contains(&LinuxNamespaceType::Uts));
2995        assert!(namespace_types.contains(&LinuxNamespaceType::Mount));
2996        assert!(
2997            !namespace_types.contains(&LinuxNamespaceType::Network),
2998            "Network namespace should NOT be present in host_network mode"
2999        );
3000    }
3001
3002    #[test]
3003    fn test_build_default_mounts() {
3004        let spec = mock_spec();
3005        let builder = BundleBuilder::new("/tmp/test-bundle".into());
3006
3007        let mounts = builder.build_default_mounts(&spec).unwrap();
3008
3009        // Check we have the expected mounts
3010        let mount_destinations: Vec<_> = mounts
3011            .iter()
3012            .map(|m| m.destination().to_string_lossy().to_string())
3013            .collect();
3014        assert!(mount_destinations.contains(&"/proc".to_string()));
3015        assert!(mount_destinations.contains(&"/dev".to_string()));
3016        assert!(mount_destinations.contains(&"/dev/pts".to_string()));
3017        assert!(mount_destinations.contains(&"/dev/shm".to_string()));
3018        assert!(mount_destinations.contains(&"/sys".to_string()));
3019    }
3020
3021    #[test]
3022    fn test_build_storage_mounts_bind() {
3023        let spec = serde_yaml::from_str::<zlayer_spec::DeploymentSpec>(
3024            r"
3025version: v1
3026deployment: test
3027services:
3028  test:
3029    image:
3030      name: test:latest
3031    storage:
3032      - type: bind
3033        source: /host/data
3034        target: /app/data
3035        readonly: true
3036",
3037        )
3038        .unwrap()
3039        .services
3040        .remove("test")
3041        .unwrap();
3042
3043        let builder = BundleBuilder::new("/tmp/test-bundle".into());
3044        let volume_paths = std::collections::HashMap::new();
3045
3046        let mounts = builder.build_storage_mounts(&spec, &volume_paths).unwrap();
3047
3048        assert_eq!(mounts.len(), 1);
3049        assert_eq!(mounts[0].destination().to_string_lossy(), "/app/data");
3050        assert_eq!(
3051            mounts[0]
3052                .source()
3053                .as_ref()
3054                .map(|s| s.to_string_lossy().to_string()),
3055            Some("/host/data".to_string())
3056        );
3057        let options = mounts[0].options().as_ref().unwrap();
3058        assert!(options.contains(&"rbind".to_string()));
3059        assert!(options.contains(&"ro".to_string()));
3060    }
3061
3062    #[test]
3063    fn test_build_storage_mounts_named() {
3064        let spec = serde_yaml::from_str::<zlayer_spec::DeploymentSpec>(
3065            r"
3066version: v1
3067deployment: test
3068services:
3069  test:
3070    image:
3071      name: test:latest
3072    storage:
3073      - type: named
3074        name: my-volume
3075        target: /app/data
3076",
3077        )
3078        .unwrap()
3079        .services
3080        .remove("test")
3081        .unwrap();
3082
3083        let dirs = zlayer_paths::ZLayerDirs::system_default();
3084        let builder = BundleBuilder::new("/tmp/test-bundle".into());
3085        let mut volume_paths = std::collections::HashMap::new();
3086        volume_paths.insert("my-volume".to_string(), dirs.volumes().join("my-volume"));
3087
3088        let mounts = builder.build_storage_mounts(&spec, &volume_paths).unwrap();
3089
3090        assert_eq!(mounts.len(), 1);
3091        assert_eq!(mounts[0].destination().to_string_lossy(), "/app/data");
3092        assert_eq!(
3093            mounts[0]
3094                .source()
3095                .as_ref()
3096                .map(|s| s.to_string_lossy().to_string()),
3097            Some(
3098                dirs.volumes()
3099                    .join("my-volume")
3100                    .to_string_lossy()
3101                    .into_owned()
3102            )
3103        );
3104    }
3105
3106    #[test]
3107    fn test_build_storage_mounts_tmpfs() {
3108        let spec = serde_yaml::from_str::<zlayer_spec::DeploymentSpec>(
3109            r"
3110version: v1
3111deployment: test
3112services:
3113  test:
3114    image:
3115      name: test:latest
3116    storage:
3117      - type: tmpfs
3118        target: /app/tmp
3119        size: 256Mi
3120        mode: 1777
3121",
3122        )
3123        .unwrap()
3124        .services
3125        .remove("test")
3126        .unwrap();
3127
3128        let builder = BundleBuilder::new("/tmp/test-bundle".into());
3129        let volume_paths = std::collections::HashMap::new();
3130
3131        let mounts = builder.build_storage_mounts(&spec, &volume_paths).unwrap();
3132
3133        assert_eq!(mounts.len(), 1);
3134        assert_eq!(mounts[0].destination().to_string_lossy(), "/app/tmp");
3135        assert_eq!(mounts[0].typ().as_ref().map(String::as_str), Some("tmpfs"));
3136        let options = mounts[0].options().as_ref().unwrap();
3137        assert!(options.iter().any(|o| o.starts_with("size=")));
3138        assert!(options.iter().any(|o| o.starts_with("mode=")));
3139    }
3140
3141    #[test]
3142    fn test_build_storage_mounts_multiple() {
3143        let spec = serde_yaml::from_str::<zlayer_spec::DeploymentSpec>(
3144            r"
3145version: v1
3146deployment: test
3147services:
3148  test:
3149    image:
3150      name: test:latest
3151    storage:
3152      - type: bind
3153        source: /etc/config
3154        target: /app/config
3155        readonly: true
3156      - type: named
3157        name: app-data
3158        target: /app/data
3159      - type: tmpfs
3160        target: /app/tmp
3161",
3162        )
3163        .unwrap()
3164        .services
3165        .remove("test")
3166        .unwrap();
3167
3168        let dirs = zlayer_paths::ZLayerDirs::system_default();
3169        let builder = BundleBuilder::new("/tmp/test-bundle".into());
3170        let mut volume_paths = std::collections::HashMap::new();
3171        volume_paths.insert("app-data".to_string(), dirs.volumes().join("app-data"));
3172
3173        let mounts = builder.build_storage_mounts(&spec, &volume_paths).unwrap();
3174
3175        assert_eq!(mounts.len(), 3);
3176
3177        // Verify each mount is correct type
3178        let destinations: Vec<String> = mounts
3179            .iter()
3180            .map(|m| m.destination().to_string_lossy().to_string())
3181            .collect();
3182        assert!(destinations.contains(&"/app/config".to_string()));
3183        assert!(destinations.contains(&"/app/data".to_string()));
3184        assert!(destinations.contains(&"/app/tmp".to_string()));
3185    }
3186
3187    #[test]
3188    fn test_build_storage_mounts_anonymous_missing_path() {
3189        let spec = serde_yaml::from_str::<zlayer_spec::DeploymentSpec>(
3190            r"
3191version: v1
3192deployment: test
3193services:
3194  test:
3195    image:
3196      name: test:latest
3197    storage:
3198      - type: anonymous
3199        target: /app/cache
3200",
3201        )
3202        .unwrap()
3203        .services
3204        .remove("test")
3205        .unwrap();
3206
3207        let builder = BundleBuilder::new("/tmp/test-bundle".into());
3208        let volume_paths = std::collections::HashMap::new(); // No path provided
3209
3210        let result = builder.build_storage_mounts(&spec, &volume_paths);
3211
3212        // Should fail because anonymous volume path not prepared
3213        assert!(result.is_err());
3214    }
3215
3216    #[cfg(target_os = "linux")]
3217    #[tokio::test]
3218    async fn test_oci_spec_includes_storage_mounts() {
3219        let id = ContainerId::new("test".to_string(), 1);
3220        let spec = serde_yaml::from_str::<zlayer_spec::DeploymentSpec>(
3221            r"
3222version: v1
3223deployment: test
3224services:
3225  test:
3226    image:
3227      name: test:latest
3228    storage:
3229      - type: bind
3230        source: /host/data
3231        target: /app/data
3232      - type: tmpfs
3233        target: /app/tmp
3234",
3235        )
3236        .unwrap()
3237        .services
3238        .remove("test")
3239        .unwrap();
3240
3241        let builder = BundleBuilder::new("/tmp/test-bundle".into());
3242        let volume_paths = std::collections::HashMap::new();
3243
3244        let oci_spec = builder
3245            .build_spec_only(&id, &spec, &volume_paths)
3246            .await
3247            .unwrap();
3248
3249        // Verify the OCI spec includes storage mounts
3250        let mounts = oci_spec.mounts().as_ref().unwrap();
3251        let destinations: Vec<String> = mounts
3252            .iter()
3253            .map(|m| m.destination().to_string_lossy().to_string())
3254            .collect();
3255
3256        // Should include both default mounts and storage mounts
3257        assert!(destinations.contains(&"/proc".to_string())); // default
3258        assert!(destinations.contains(&"/dev".to_string())); // default
3259        assert!(destinations.contains(&"/app/data".to_string())); // storage bind
3260        assert!(destinations.contains(&"/app/tmp".to_string())); // storage tmpfs
3261    }
3262
3263    fn mock_gpu_spec(vendor: &str, count: u32) -> ServiceSpec {
3264        let yaml = format!(
3265            "
3266version: v1
3267deployment: test
3268services:
3269  test:
3270    rtype: service
3271    image:
3272      name: test:latest
3273    resources:
3274      gpu:
3275        count: {count}
3276        vendor: {vendor}
3277    endpoints:
3278      - name: http
3279        protocol: http
3280        port: 8080
3281"
3282        );
3283        serde_yaml::from_str::<DeploymentSpec>(&yaml)
3284            .unwrap()
3285            .services
3286            .remove("test")
3287            .unwrap()
3288    }
3289
3290    fn write_nvidia_cdi_fixture(dir: &std::path::Path, json: &str) {
3291        std::fs::write(dir.join("nvidia.json"), json).unwrap();
3292    }
3293
3294    fn nvidia_cdi_fixture() -> &'static str {
3295        r#"{
3296            "cdiVersion": "0.6.0",
3297            "kind": "nvidia.com/gpu",
3298            "devices": [{
3299                "name": "0",
3300                "containerEdits": {
3301                    "deviceNodes": [
3302                        {"path": "/dev/nvidia0", "type": "c", "major": 195, "minor": 0}
3303                    ],
3304                    "env": ["NVIDIA_VISIBLE_DEVICES=0"],
3305                    "hooks": {
3306                        "createContainer": [{
3307                            "path": "/usr/bin/nvidia-container-runtime-hook",
3308                            "args": ["nvidia-container-runtime-hook", "prestart"]
3309                        }]
3310                    }
3311                }
3312            }]
3313        }"#
3314    }
3315
3316    #[cfg(target_os = "linux")]
3317    #[tokio::test]
3318    async fn gpu_spec_translates_to_cdi_device_nodes() {
3319        let dir = tempfile::tempdir().unwrap();
3320        write_nvidia_cdi_fixture(dir.path(), nvidia_cdi_fixture());
3321        let registry = std::sync::Arc::new(crate::cdi::CdiRegistry::discover_from(&[dir.path()]));
3322
3323        let id = ContainerId::new("test".to_string(), 1);
3324        let spec = mock_gpu_spec("nvidia", 1);
3325        let builder = BundleBuilder::new("/tmp/test-bundle-cdi".into()).with_cdi_registry(registry);
3326
3327        let oci_spec = builder
3328            .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3329            .await
3330            .expect("build with CDI fixture");
3331
3332        // CDI device node merged into linux.devices
3333        let linux = oci_spec.linux().as_ref().expect("linux config present");
3334        let devices = linux.devices().as_ref().expect("devices present");
3335        assert!(
3336            devices
3337                .iter()
3338                .any(|d| d.path() == std::path::Path::new("/dev/nvidia0")),
3339            "expected /dev/nvidia0 from CDI fixture; got {:?}",
3340            devices
3341                .iter()
3342                .map(oci_spec::runtime::LinuxDevice::path)
3343                .collect::<Vec<_>>()
3344        );
3345
3346        // CDI env var merged into process.env
3347        let process = oci_spec.process().as_ref().expect("process present");
3348        let env = process.env().as_ref().expect("env present");
3349        assert!(
3350            env.iter().any(|e| e == "NVIDIA_VISIBLE_DEVICES=0"),
3351            "expected NVIDIA_VISIBLE_DEVICES=0 in env; got {env:?}"
3352        );
3353
3354        // CDI hook merged into hooks.createContainer
3355        let hooks = oci_spec.hooks().as_ref().expect("hooks present");
3356        let create_container = hooks
3357            .create_container()
3358            .as_ref()
3359            .expect("createContainer hooks present");
3360        assert_eq!(create_container.len(), 1);
3361        assert_eq!(
3362            create_container[0].path(),
3363            &std::path::PathBuf::from("/usr/bin/nvidia-container-runtime-hook")
3364        );
3365    }
3366
3367    #[tokio::test]
3368    async fn gpu_spec_with_missing_cdi_returns_error() {
3369        // Empty tempdir — no CDI specs installed at all.
3370        let dir = tempfile::tempdir().unwrap();
3371        let registry = std::sync::Arc::new(crate::cdi::CdiRegistry::discover_from(&[dir.path()]));
3372
3373        let id = ContainerId::new("test".to_string(), 1);
3374        let spec = mock_gpu_spec("nvidia", 1);
3375        let builder =
3376            BundleBuilder::new("/tmp/test-bundle-cdi-missing".into()).with_cdi_registry(registry);
3377
3378        let err = builder
3379            .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3380            .await
3381            .expect_err("should fail when CDI registry is empty");
3382
3383        match err {
3384            AgentError::InvalidSpec(msg) => {
3385                assert!(
3386                    msg.contains("nvidia") || msg.contains("CDI"),
3387                    "error should mention CDI / vendor; got: {msg}"
3388                );
3389            }
3390            other => panic!("expected InvalidSpec, got {other:?}"),
3391        }
3392    }
3393
3394    #[tokio::test]
3395    async fn gpu_spec_with_unknown_device_returns_error() {
3396        // Spec has device "0" but the request will ask for two GPUs (so the
3397        // resolver will look for "1" and fail).
3398        let dir = tempfile::tempdir().unwrap();
3399        write_nvidia_cdi_fixture(dir.path(), nvidia_cdi_fixture());
3400        let registry = std::sync::Arc::new(crate::cdi::CdiRegistry::discover_from(&[dir.path()]));
3401
3402        let id = ContainerId::new("test".to_string(), 1);
3403        let spec = mock_gpu_spec("nvidia", 2);
3404        let builder =
3405            BundleBuilder::new("/tmp/test-bundle-cdi-unknown".into()).with_cdi_registry(registry);
3406
3407        let err = builder
3408            .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3409            .await
3410            .expect_err("should fail when device '1' is not declared");
3411        match err {
3412            AgentError::InvalidSpec(msg) => {
3413                assert!(
3414                    msg.contains("'1'") || msg.contains("device"),
3415                    "error should mention the missing device; got: {msg}"
3416                );
3417            }
3418            other => panic!("expected InvalidSpec, got {other:?}"),
3419        }
3420    }
3421
3422    #[cfg(target_os = "linux")]
3423    #[tokio::test]
3424    async fn gpu_spec_with_all_devices_expands_to_all_in_spec() {
3425        // Fixture with two declared devices ("0" and "1").
3426        let dir = tempfile::tempdir().unwrap();
3427        let fixture = r#"{
3428            "cdiVersion": "0.6.0",
3429            "kind": "nvidia.com/gpu",
3430            "devices": [
3431                {
3432                    "name": "0",
3433                    "containerEdits": {
3434                        "env": ["NVIDIA_VISIBLE_DEVICES=0"],
3435                        "deviceNodes": [
3436                            {"path": "/dev/nvidia0", "type": "c", "major": 195, "minor": 0}
3437                        ]
3438                    }
3439                },
3440                {
3441                    "name": "1",
3442                    "containerEdits": {
3443                        "env": ["NVIDIA_VISIBLE_DEVICES=1"],
3444                        "deviceNodes": [
3445                            {"path": "/dev/nvidia1", "type": "c", "major": 195, "minor": 1}
3446                        ]
3447                    }
3448                }
3449            ]
3450        }"#;
3451        write_nvidia_cdi_fixture(dir.path(), fixture);
3452        let registry = std::sync::Arc::new(crate::cdi::CdiRegistry::discover_from(&[dir.path()]));
3453
3454        // Resolve "all" via the registry directly to validate expansion
3455        // semantics independently of how we map count -> names.
3456        let edits = registry
3457            .resolve_for_kind("nvidia.com/gpu", &["all".to_string()])
3458            .expect("resolve all");
3459        assert_eq!(edits.len(), 2);
3460
3461        // Now build the bundle for a 2-GPU service and confirm both nodes
3462        // land in linux.devices.
3463        let id = ContainerId::new("test".to_string(), 1);
3464        let spec = mock_gpu_spec("nvidia", 2);
3465        let builder =
3466            BundleBuilder::new("/tmp/test-bundle-cdi-all".into()).with_cdi_registry(registry);
3467
3468        let oci_spec = builder
3469            .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3470            .await
3471            .expect("build with 2-device fixture");
3472
3473        let devices = oci_spec
3474            .linux()
3475            .as_ref()
3476            .unwrap()
3477            .devices()
3478            .as_ref()
3479            .expect("devices present");
3480        let paths: Vec<_> = devices.iter().map(|d| d.path().clone()).collect();
3481        assert!(paths.contains(&std::path::PathBuf::from("/dev/nvidia0")));
3482        assert!(paths.contains(&std::path::PathBuf::from("/dev/nvidia1")));
3483    }
3484
3485    /// Build the standard fixture-backed CDI registry used by the MPS /
3486    /// time-slicing tests. Identical to the helper used by the 5.A CDI
3487    /// tests above but expressed as a closure-style helper to keep each test
3488    /// self-contained.
3489    fn build_nvidia_cdi_registry(dir: &std::path::Path) -> std::sync::Arc<crate::cdi::CdiRegistry> {
3490        write_nvidia_cdi_fixture(dir, nvidia_cdi_fixture());
3491        std::sync::Arc::new(crate::cdi::CdiRegistry::discover_from(&[dir]))
3492    }
3493
3494    #[cfg(target_os = "linux")]
3495    #[tokio::test]
3496    async fn gpu_spec_with_mps_sharing_injects_env_and_mounts() {
3497        // Stage host-side MPS directories in a tempdir so the resolver's
3498        // `is_dir()` check passes without touching /tmp/nvidia-mps on the
3499        // real host.
3500        let cdi_dir = tempfile::tempdir().unwrap();
3501        let mps_root = tempfile::tempdir().unwrap();
3502        let pipe_dir = mps_root.path().join("nvidia-mps");
3503        let log_dir = mps_root.path().join("nvidia-log");
3504        std::fs::create_dir(&pipe_dir).unwrap();
3505        std::fs::create_dir(&log_dir).unwrap();
3506        let registry = build_nvidia_cdi_registry(cdi_dir.path());
3507
3508        let id = ContainerId::new("test".to_string(), 1);
3509        let mut spec = mock_gpu_spec("nvidia", 1);
3510        let gpu = spec.resources.gpu.as_mut().expect("gpu spec set");
3511        gpu.sharing = Some(zlayer_spec::GpuSharingMode::Mps);
3512        gpu.mps_pipe_dir = Some(pipe_dir.to_string_lossy().into_owned());
3513        gpu.mps_log_dir = Some(log_dir.to_string_lossy().into_owned());
3514
3515        let builder =
3516            BundleBuilder::new("/tmp/test-bundle-mps-env".into()).with_cdi_registry(registry);
3517        let oci_spec = builder
3518            .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3519            .await
3520            .expect("build with MPS sharing");
3521
3522        let env = oci_spec
3523            .process()
3524            .as_ref()
3525            .and_then(|p| p.env().as_ref())
3526            .expect("env present");
3527        let pipe_expect = format!("CUDA_MPS_PIPE_DIRECTORY={}", pipe_dir.display());
3528        let log_expect = format!("CUDA_MPS_LOG_DIRECTORY={}", log_dir.display());
3529        assert!(
3530            env.iter().any(|e| e == &pipe_expect),
3531            "expected {pipe_expect} in env; got {env:?}"
3532        );
3533        assert!(
3534            env.iter().any(|e| e == &log_expect),
3535            "expected {log_expect} in env; got {env:?}"
3536        );
3537
3538        let mounts = oci_spec.mounts().as_ref().expect("mounts present");
3539        assert!(
3540            mounts
3541                .iter()
3542                .any(|m| m.destination() == &pipe_dir && m.source().as_ref() == Some(&pipe_dir)),
3543            "expected bind mount of MPS pipe dir {}; got destinations {:?}",
3544            pipe_dir.display(),
3545            mounts.iter().map(Mount::destination).collect::<Vec<_>>()
3546        );
3547        assert!(
3548            mounts
3549                .iter()
3550                .any(|m| m.destination() == &log_dir && m.source().as_ref() == Some(&log_dir)),
3551            "expected bind mount of MPS log dir {}",
3552            log_dir.display()
3553        );
3554    }
3555
3556    #[tokio::test]
3557    async fn gpu_spec_with_mps_sharing_fails_when_pipe_dir_missing() {
3558        let cdi_dir = tempfile::tempdir().unwrap();
3559        let registry = build_nvidia_cdi_registry(cdi_dir.path());
3560
3561        let id = ContainerId::new("test".to_string(), 1);
3562        let mut spec = mock_gpu_spec("nvidia", 1);
3563        let gpu = spec.resources.gpu.as_mut().expect("gpu spec set");
3564        gpu.sharing = Some(zlayer_spec::GpuSharingMode::Mps);
3565        // Path that demonstrably does not exist — tempdir() returns a unique
3566        // path so appending "definitely-not-here" gives a guaranteed miss.
3567        let missing = tempfile::tempdir().unwrap();
3568        let missing_path = missing.path().join("definitely-not-here");
3569        gpu.mps_pipe_dir = Some(missing_path.to_string_lossy().into_owned());
3570
3571        let builder =
3572            BundleBuilder::new("/tmp/test-bundle-mps-missing".into()).with_cdi_registry(registry);
3573        let err = builder
3574            .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3575            .await
3576            .expect_err("should fail when MPS pipe dir is missing");
3577        match err {
3578            AgentError::GpuSharingUnavailable { mode, reason } => {
3579                assert_eq!(mode, "mps");
3580                assert!(
3581                    reason.contains("pipe") || reason.contains(&missing_path.display().to_string()),
3582                    "reason should mention the missing path; got: {reason}"
3583                );
3584            }
3585            other => panic!("expected GpuSharingUnavailable, got {other:?}"),
3586        }
3587    }
3588
3589    #[cfg(target_os = "linux")]
3590    #[tokio::test]
3591    async fn gpu_spec_with_timeslicing_injects_visible_devices() {
3592        let cdi_dir = tempfile::tempdir().unwrap();
3593        let registry = build_nvidia_cdi_registry(cdi_dir.path());
3594
3595        let id = ContainerId::new("test".to_string(), 1);
3596        let mut spec = mock_gpu_spec("nvidia", 1);
3597        let gpu = spec.resources.gpu.as_mut().expect("gpu spec set");
3598        gpu.sharing = Some(zlayer_spec::GpuSharingMode::TimeSlice);
3599        gpu.time_slice_index = Some(2);
3600
3601        let builder =
3602            BundleBuilder::new("/tmp/test-bundle-timeslice".into()).with_cdi_registry(registry);
3603        let oci_spec = builder
3604            .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3605            .await
3606            .expect("build with time-slicing");
3607
3608        let env = oci_spec
3609            .process()
3610            .as_ref()
3611            .and_then(|p| p.env().as_ref())
3612            .expect("env present");
3613        // Time-slicing must clobber any earlier `CUDA_VISIBLE_DEVICES` (e.g.
3614        // the CDI-emitted full-device list) to advertise exactly the slice.
3615        let cuda_entries: Vec<&String> = env
3616            .iter()
3617            .filter(|e| e.starts_with("CUDA_VISIBLE_DEVICES="))
3618            .collect();
3619        assert_eq!(
3620            cuda_entries.len(),
3621            1,
3622            "exactly one CUDA_VISIBLE_DEVICES expected; got {cuda_entries:?}"
3623        );
3624        assert_eq!(cuda_entries[0], "CUDA_VISIBLE_DEVICES=2");
3625    }
3626
3627    #[cfg(target_os = "linux")]
3628    #[tokio::test]
3629    async fn gpu_spec_without_sharing_omits_mps_env() {
3630        let cdi_dir = tempfile::tempdir().unwrap();
3631        let registry = build_nvidia_cdi_registry(cdi_dir.path());
3632
3633        let id = ContainerId::new("test".to_string(), 1);
3634        let spec = mock_gpu_spec("nvidia", 1);
3635        assert!(spec.resources.gpu.as_ref().unwrap().sharing.is_none());
3636
3637        let builder =
3638            BundleBuilder::new("/tmp/test-bundle-no-sharing".into()).with_cdi_registry(registry);
3639        let oci_spec = builder
3640            .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3641            .await
3642            .expect("build without sharing");
3643
3644        let env = oci_spec
3645            .process()
3646            .as_ref()
3647            .and_then(|p| p.env().as_ref())
3648            .expect("env present");
3649        assert!(
3650            !env.iter().any(|e| e.starts_with("CUDA_MPS_")),
3651            "no CUDA_MPS_* env should be present without sharing; got {env:?}"
3652        );
3653
3654        // No MPS mount should be added either. The 5.A CDI fixture mounts a
3655        // /dev/nvidia0 device but never bind-mounts /tmp/nvidia-mps; verify
3656        // we don't sneak that in.
3657        let mounts = oci_spec.mounts().as_ref().expect("mounts present");
3658        assert!(
3659            !mounts
3660                .iter()
3661                .any(|m| { m.destination().to_string_lossy().contains("nvidia-mps") }),
3662            "no MPS pipe mount should be present without sharing"
3663        );
3664    }
3665
3666    #[cfg(unix)]
3667    mod subid_tests {
3668        use super::super::read_subid_range;
3669        use std::io::Write;
3670
3671        #[test]
3672        fn read_subid_range_returns_range_for_user() {
3673            let mut tmp = tempfile::NamedTempFile::new().unwrap();
3674            writeln!(tmp, "alice:100000:65536").unwrap();
3675            writeln!(tmp, "bob:165536:65536").unwrap();
3676            tmp.flush().unwrap();
3677            let path = tmp.path().to_str().unwrap();
3678            assert_eq!(read_subid_range(path, "bob"), Some((165_536, 65_536)));
3679            assert_eq!(read_subid_range(path, "alice"), Some((100_000, 65_536)));
3680        }
3681
3682        #[test]
3683        fn read_subid_range_returns_none_for_unknown_user() {
3684            let mut tmp = tempfile::NamedTempFile::new().unwrap();
3685            writeln!(tmp, "alice:100000:65536").unwrap();
3686            tmp.flush().unwrap();
3687            assert_eq!(
3688                read_subid_range(tmp.path().to_str().unwrap(), "carol"),
3689                None
3690            );
3691        }
3692
3693        #[test]
3694        fn read_subid_range_returns_none_on_missing_file() {
3695            assert_eq!(
3696                read_subid_range("/this/path/does/not/exist/subuid", "anyone"),
3697                None
3698            );
3699        }
3700    }
3701}