Skip to main content

zlayer_agent/
bundle.rs

1//! OCI Bundle Creation
2//!
3//! Creates OCI-compliant bundles for container runtimes using libcontainer (youki).
4//! A bundle consists of a directory with:
5//! - config.json: OCI runtime specification
6//! - rootfs/: Container filesystem (symlink or bind mount target)
7
8use crate::cdi::{self, CdiContainerEdits, CdiRegistry};
9use crate::error::{AgentError, Result};
10use crate::runtime::ContainerId;
11use oci_spec::runtime::{
12    Capability, Hook, HookBuilder, Hooks, HooksBuilder, LinuxBuilder, LinuxCapabilitiesBuilder,
13    LinuxCpuBuilder, LinuxDeviceBuilder, LinuxDeviceCgroupBuilder, LinuxDeviceType,
14    LinuxMemoryBuilder, LinuxNamespaceBuilder, LinuxNamespaceType, LinuxResourcesBuilder, Mount,
15    MountBuilder, ProcessBuilder, RootBuilder, Spec, SpecBuilder, UserBuilder,
16};
17// `LinuxIdMappingBuilder` is only used by the unix-gated rootless user-namespace
18// helpers below; importing it unconditionally trips dead-code lints on Windows.
19#[cfg(unix)]
20use oci_spec::runtime::LinuxIdMappingBuilder;
21use std::collections::{HashMap, HashSet};
22// `MetadataExt` is only meaningful on Unix-like hosts where `/dev/*` nodes exist
23// and have major/minor numbers. On Windows this module is still built so that
24// `BundleBuilder::build_spec_only` (cross-platform OCI Spec generation) can be
25// called from the WSL2 delegate runtime, which then pipes the generated
26// `config.json` into a Linux WSL2 distro that owns the actual device
27// fingerprint. See G-1 / G-2 in the Windows plan. The import is performed
28// inside `get_device_major_minor` itself to avoid an unused-import warning on
29// non-Unix platforms.
30use std::path::{Path, PathBuf};
31use std::str::FromStr;
32use std::sync::Arc;
33use tokio::fs;
34use zlayer_secrets::SecretsProvider;
35use zlayer_spec::{GpuSharingMode, ServiceSpec, StorageSpec, StorageTier};
36
/// Default host directory for the NVIDIA MPS control pipe when the spec
/// doesn't override [`zlayer_spec::GpuSpec::mps_pipe_dir`].
///
/// `resolve_mps_dirs` falls back to this path and requires it to be an
/// existing directory on the host.
const DEFAULT_MPS_PIPE_DIR: &str = "/tmp/nvidia-mps";

/// Default host directory for NVIDIA MPS log output when the spec doesn't
/// override [`zlayer_spec::GpuSpec::mps_log_dir`].
///
/// `resolve_mps_dirs` falls back to this path and requires it to be an
/// existing directory on the host.
const DEFAULT_MPS_LOG_DIR: &str = "/tmp/nvidia-log";

/// Container path where a host-supplied NVIDIA time-slicing config YAML is
/// surfaced (read-only). The file is informational — `ZLayer` doesn't interpret
/// it; tools running inside the container can read it to discover slice
/// topology.
const TIMESLICE_CONFIG_CONTAINER_PATH: &str = "/etc/nvidia/gpu-time-slicing.yaml";
50
/// Resolved MPS host directories (pipe + log), validated to exist on disk.
///
/// Returned by [`resolve_mps_dirs`] only when `GpuSpec.sharing == Mps`. Both
/// paths were confirmed to be directories at the time the helper ran —
/// callers can bind-mount them directly. The check is a point-in-time probe;
/// nothing here keeps the directories alive afterwards.
struct MpsDirs {
    /// Host directory holding the MPS control pipe (spec override or
    /// [`DEFAULT_MPS_PIPE_DIR`]).
    pipe_dir: PathBuf,
    /// Host directory receiving MPS log output (spec override or
    /// [`DEFAULT_MPS_LOG_DIR`]).
    log_dir: PathBuf,
}
60
61/// Resolve and validate the MPS pipe / log directories for a GPU spec.
62///
63/// Returns `Ok(None)` when sharing is not MPS (or absent), `Ok(Some(...))`
64/// when both directories exist on the host, or
65/// [`AgentError::GpuSharingUnavailable`] when either directory is missing.
66///
67/// Defaults to [`DEFAULT_MPS_PIPE_DIR`] / [`DEFAULT_MPS_LOG_DIR`] when the
68/// spec omits explicit paths, matching the convention used by
69/// `nvidia-cuda-mps-control` out of the box.
70fn resolve_mps_dirs(gpu: &zlayer_spec::GpuSpec) -> Result<Option<MpsDirs>> {
71    if gpu.sharing != Some(GpuSharingMode::Mps) {
72        return Ok(None);
73    }
74
75    let pipe_dir = PathBuf::from(gpu.mps_pipe_dir.as_deref().unwrap_or(DEFAULT_MPS_PIPE_DIR));
76    let log_dir = PathBuf::from(gpu.mps_log_dir.as_deref().unwrap_or(DEFAULT_MPS_LOG_DIR));
77
78    if !pipe_dir.is_dir() {
79        return Err(AgentError::GpuSharingUnavailable {
80            mode: "mps".to_string(),
81            reason: format!(
82                "MPS pipe directory {} does not exist; ensure nvidia-cuda-mps-control is running",
83                pipe_dir.display()
84            ),
85        });
86    }
87    if !log_dir.is_dir() {
88        return Err(AgentError::GpuSharingUnavailable {
89            mode: "mps".to_string(),
90            reason: format!(
91                "MPS log directory {} does not exist; ensure nvidia-cuda-mps-control is running",
92                log_dir.display()
93            ),
94        });
95    }
96
97    Ok(Some(MpsDirs { pipe_dir, log_dir }))
98}
99
100/// Convert a CDI device node descriptor into the OCI [`LinuxDevice`] used by
101/// the runtime.
102///
103/// CDI device nodes may omit `type`, `major`, and `minor` — in that case we
104/// probe the host (via `get_device_type` / `get_device_major_minor`) using
105/// the resolved host path, falling back to character device with zero
106/// major/minor when the file is unavailable (typical for test fixtures
107/// that reference paths that don't exist on the build host).
108fn cdi_node_to_oci_device(
109    node: &crate::cdi::CdiDeviceNode,
110) -> Result<oci_spec::runtime::LinuxDevice> {
111    let host_path = node.host_path.as_deref().unwrap_or(&node.path);
112
113    let dev_type = match node.device_type.as_deref() {
114        Some("c" | "u") => LinuxDeviceType::C,
115        Some("b") => LinuxDeviceType::B,
116        Some("p") => LinuxDeviceType::P,
117        _ => get_device_type(host_path).unwrap_or(LinuxDeviceType::C),
118    };
119
120    let (major, minor) = if let (Some(maj), Some(min)) = (node.major, node.minor) {
121        (maj, min)
122    } else {
123        get_device_major_minor(host_path).unwrap_or((0, 0))
124    };
125
126    let mut builder = LinuxDeviceBuilder::default()
127        .path(node.path.clone())
128        .typ(dev_type)
129        .major(major)
130        .minor(minor);
131    if let Some(mode) = node.file_mode {
132        builder = builder.file_mode(mode);
133    } else {
134        builder = builder.file_mode(0o666u32);
135    }
136    builder = builder.uid(node.uid.unwrap_or(0));
137    builder = builder.gid(node.gid.unwrap_or(0));
138
139    builder.build().map_err(|e| {
140        AgentError::InvalidSpec(format!(
141            "failed to build CDI device {path}: {e}",
142            path = node.path
143        ))
144    })
145}
146
147/// Convert a CDI hook descriptor into the OCI [`Hook`] used by the runtime.
148fn convert_cdi_hook(cdi_hook: &crate::cdi::CdiHook) -> Result<Hook> {
149    let mut builder = HookBuilder::default().path(PathBuf::from(&cdi_hook.path));
150    if !cdi_hook.args.is_empty() {
151        builder = builder.args(cdi_hook.args.clone());
152    }
153    if !cdi_hook.env.is_empty() {
154        builder = builder.env(cdi_hook.env.clone());
155    }
156    builder
157        .build()
158        .map_err(|e| AgentError::InvalidSpec(format!("failed to build CDI hook: {e}")))
159}
160
/// All Linux capabilities for privileged mode.
///
/// The list is kept sorted by variant name and is maintained by hand.
/// NOTE(review): it presumably needs a new entry whenever `oci_spec` gains a
/// `Capability` variant — confirm against the crate when upgrading.
const ALL_CAPABILITIES: &[Capability] = &[
    Capability::AuditControl,
    Capability::AuditRead,
    Capability::AuditWrite,
    Capability::BlockSuspend,
    Capability::Bpf,
    Capability::CheckpointRestore,
    Capability::Chown,
    Capability::DacOverride,
    Capability::DacReadSearch,
    Capability::Fowner,
    Capability::Fsetid,
    Capability::IpcLock,
    Capability::IpcOwner,
    Capability::Kill,
    Capability::Lease,
    Capability::LinuxImmutable,
    Capability::MacAdmin,
    Capability::MacOverride,
    Capability::Mknod,
    Capability::NetAdmin,
    Capability::NetBindService,
    Capability::NetBroadcast,
    Capability::NetRaw,
    Capability::Perfmon,
    Capability::Setfcap,
    Capability::Setgid,
    Capability::Setpcap,
    Capability::Setuid,
    Capability::SysAdmin,
    Capability::SysBoot,
    Capability::SysChroot,
    Capability::SysModule,
    Capability::SysNice,
    Capability::SysPacct,
    Capability::SysPtrace,
    Capability::SysRawio,
    Capability::SysResource,
    Capability::SysTime,
    Capability::SysTtyConfig,
    Capability::Syslog,
    Capability::WakeAlarm,
];
205
/// Render the contents of an `/etc/resolv.conf` for the given resolver
/// addresses.
///
/// One `nameserver <ip>` line per entry, in order, followed by a single
/// `options edns0` line (enables EDNS(0) so larger UDP responses — e.g. the
/// overlay resolver forwarding A/AAAA records — are not truncated). The output
/// is deliberately minimal: no `search`/`domain` directives, which would
/// otherwise be inherited from the (hijacked) host resolv.conf we are
/// replacing.
///
/// This exists because youki/libcontainer performs NO resolv.conf handling of
/// its own — without an explicit bind mount the container sees only whatever
/// `/etc/resolv.conf` shipped in the image (often empty or absent). The caller
/// writes this string into the bundle directory and bind-mounts it read-only at
/// `/etc/resolv.conf`.
///
/// An empty `nameservers` slice yields just the `options edns0` line.
#[must_use]
pub fn generate_resolv_conf(nameservers: &[String]) -> String {
    let mut out = String::new();
    for ns in nameservers {
        out.push_str("nameserver ");
        out.push_str(ns);
        out.push('\n');
    }
    out.push_str("options edns0\n");
    out
}

/// Parse memory string like "512Mi", "1Gi" to bytes
///
/// Supports both IEC (binary) and SI (decimal) units:
/// - IEC: Ki, Mi, Gi, Ti (powers of 1024)
/// - SI: K/k, M/m, G/g, T/t (powers of 1000)
/// - No suffix: bytes
///
/// Leading and trailing whitespace around the whole string is ignored. The
/// numeric part must be an unsigned integer (no sign, fraction or exponent).
///
/// # Examples
/// ```ignore
/// assert_eq!(parse_memory_string("512Mi").unwrap(), 512 * 1024 * 1024);
/// assert_eq!(parse_memory_string("1Gi").unwrap(), 1024 * 1024 * 1024);
/// assert_eq!(parse_memory_string("2G").unwrap(), 2 * 1000 * 1000 * 1000);
/// ```
///
/// # Errors
/// Returns an error if the string is empty, if the numeric part cannot be
/// parsed, or if the resulting byte count does not fit in a `u64`.
pub fn parse_memory_string(s: &str) -> std::result::Result<u64, String> {
    // Two-letter IEC suffixes come first so "Mi" is never mistaken for a bare
    // number followed by an unknown 'i'. The single-letter SI suffixes are
    // mutually exclusive, so their relative order does not matter.
    const UNITS: &[(&str, u64)] = &[
        ("Ki", 1 << 10),
        ("Mi", 1 << 20),
        ("Gi", 1 << 30),
        ("Ti", 1 << 40),
        ("K", 1_000),
        ("k", 1_000),
        ("M", 1_000_000),
        ("m", 1_000_000),
        ("G", 1_000_000_000),
        ("g", 1_000_000_000),
        ("T", 1_000_000_000_000),
        ("t", 1_000_000_000_000),
    ];

    let s = s.trim();
    if s.is_empty() {
        return Err("empty memory string".to_string());
    }

    let (num_str, multiplier) = UNITS
        .iter()
        .find_map(|&(suffix, factor)| s.strip_suffix(suffix).map(|digits| (digits, factor)))
        .unwrap_or((s, 1));

    let num: u64 = num_str
        .parse()
        .map_err(|e| format!("invalid number: {e}"))?;

    // A plain `*` would panic in debug builds and silently wrap in release
    // builds, turning an absurdly large request into a tiny limit.
    num.checked_mul(multiplier)
        .ok_or_else(|| format!("memory size overflows u64: {s}"))
}
281
/// Split a raw Linux `dev_t` value into its `(major, minor)` pair.
///
/// Uses the same layout as glibc's `gnu_dev_major` / `gnu_dev_minor`: the
/// major number is bits 8..20 plus bits 44..64, the minor number is bits 0..8
/// plus bits 20..44. For legacy devices (major and minor both below 256) this
/// is identical to the historic 8-bit/8-bit split.
#[cfg(unix)]
// Both results are masked to at most 32 significant bits, so the `u64 -> i64`
// casts cannot wrap.
#[allow(clippy::cast_possible_wrap)]
fn decode_linux_rdev(rdev: u64) -> (i64, i64) {
    let major = ((rdev >> 32) & 0xffff_f000) | ((rdev >> 8) & 0x0000_0fff);
    let minor = ((rdev >> 12) & 0xffff_ff00) | (rdev & 0x0000_00ff);
    (major as i64, minor as i64)
}

/// Get major and minor device numbers from a device path
///
/// The numbers are decoded with the Linux `dev_t` layout (see
/// `decode_linux_rdev`), so devices with a major or minor number of 256 or
/// above are reported correctly instead of being truncated to 8 bits.
///
/// Unix-only: relies on `MetadataExt::rdev()` which isn't available on Windows.
/// When `bundle.rs` is compiled for a Windows host (for the WSL2 delegate's
/// cross-platform `build_spec_only` path), device probing is skipped entirely —
/// the Linux side of the delegate is responsible for its own device fingerprint.
/// A non-Unix stub returns `Unsupported` so the `if let Ok(..)` /
/// `.unwrap_or(..)` call sites at the CDI / GPU passthrough paths skip cleanly.
///
/// # Errors
/// Propagates the I/O error from `std::fs::metadata` (e.g. the path does not
/// exist or is not accessible).
#[cfg(unix)]
fn get_device_major_minor(path: &str) -> std::io::Result<(i64, i64)> {
    use std::os::unix::fs::MetadataExt;
    let metadata = std::fs::metadata(path)?;
    Ok(decode_linux_rdev(metadata.rdev()))
}
301
/// Fallback for hosts without Unix device nodes: always fails with
/// [`std::io::ErrorKind::Unsupported`], which the `if let Ok(..)` call sites
/// treat as "skip the probe".
#[cfg(not(unix))]
fn get_device_major_minor(_path: &str) -> std::io::Result<(i64, i64)> {
    let unsupported = std::io::Error::new(
        std::io::ErrorKind::Unsupported,
        "device-cgroup probes require Unix",
    );
    Err(unsupported)
}
310
311/// Detect device type from path
312///
313/// Unix-only: uses `FileTypeExt::is_char_device` / `is_block_device` which are
314/// not available on Windows. See `get_device_major_minor` for the rationale.
315#[cfg(unix)]
316fn get_device_type(path: &str) -> std::io::Result<LinuxDeviceType> {
317    use std::os::unix::fs::FileTypeExt;
318    let metadata = std::fs::metadata(path)?;
319    let file_type = metadata.file_type();
320    if file_type.is_char_device() {
321        Ok(LinuxDeviceType::C)
322    } else if file_type.is_block_device() {
323        Ok(LinuxDeviceType::B)
324    } else {
325        Ok(LinuxDeviceType::U) // Unknown/other
326    }
327}
328
/// Fallback for hosts without Unix device nodes: always fails with
/// [`std::io::ErrorKind::Unsupported`], which the `.unwrap_or(..)` call sites
/// treat as "use the default device type".
#[cfg(not(unix))]
fn get_device_type(_path: &str) -> std::io::Result<LinuxDeviceType> {
    let unsupported = std::io::Error::new(
        std::io::ErrorKind::Unsupported,
        "device-cgroup probes require Unix",
    );
    Err(unsupported)
}
337
/// Builder for OCI container bundles
///
/// Creates the directory structure and config.json required for OCI-compliant
/// container runtimes like runc or youki.
///
/// Every field except `bundle_dir` starts out empty / `None` / `false` (see
/// [`BundleBuilder::new`]) and is filled in through the chainable `with_*`
/// setters. `Debug` is implemented by hand so the secrets provider and CDI
/// registry are printed only as "present / absent" flags.
///
/// # Example
/// ```ignore
/// let dirs = zlayer_paths::ZLayerDirs::system_default();
/// let builder = BundleBuilder::new(dirs.bundles().join("mycontainer"))
///     .with_rootfs(dirs.rootfs().join("myimage"));
///
/// let bundle_path = builder.build(&container_id, &service_spec).await?;
/// ```
#[derive(Clone)]
pub struct BundleBuilder {
    /// Base directory for the bundle
    bundle_dir: PathBuf,
    /// Path to the unpacked rootfs (from image layers)
    rootfs_path: Option<PathBuf>,
    /// Custom hostname (defaults to container ID)
    hostname: Option<String>,
    /// Additional environment variables, as `(key, value)` pairs in insertion order
    extra_env: Vec<(String, String)>,
    /// Custom working directory
    cwd: Option<String>,
    /// Custom command/args to run (overrides image default)
    args: Option<Vec<String>>,
    /// Pre-resolved volume paths from `StorageManager`
    volume_paths: HashMap<String, PathBuf>,
    /// Image configuration from the OCI registry (entrypoint, cmd, env, workdir, user)
    image_config: Option<zlayer_registry::ImageConfig>,
    /// Use host networking (skip Network namespace, container shares host network)
    host_network: bool,
    /// Secrets provider for resolving $S: prefixed env vars
    secrets_provider: Option<Arc<dyn SecretsProvider>>,
    /// Deployment scope for secret lookups (e.g., deployment name)
    deployment_scope: Option<String>,
    /// Host-side Unix socket path to bind-mount into the container
    socket_path: Option<String>,
    /// Optional CDI registry override (defaults to discovery from system paths).
    ///
    /// Wrapped in `Arc` so [`BundleBuilder`] can stay [`Clone`]. Primarily set
    /// in tests via [`BundleBuilder::with_cdi_registry`]; production paths
    /// leave this `None` and lazy-discover via [`CdiRegistry::discover`] when
    /// a `GpuSpec` is present.
    cdi_registry: Option<Arc<CdiRegistry>>,
}
385
386impl std::fmt::Debug for BundleBuilder {
387    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
388        f.debug_struct("BundleBuilder")
389            .field("bundle_dir", &self.bundle_dir)
390            .field("rootfs_path", &self.rootfs_path)
391            .field("hostname", &self.hostname)
392            .field("extra_env", &self.extra_env)
393            .field("cwd", &self.cwd)
394            .field("args", &self.args)
395            .field("volume_paths", &self.volume_paths)
396            .field("image_config", &self.image_config)
397            .field("host_network", &self.host_network)
398            .field("secrets_provider", &self.secrets_provider.is_some())
399            .field("deployment_scope", &self.deployment_scope)
400            .field("socket_path", &self.socket_path)
401            .field("cdi_registry", &self.cdi_registry.is_some())
402            .finish()
403    }
404}
405
406/// Build OCI `uid_mappings` (or `gid_mappings` — same structure) for a rootless
407/// container. Always emits a single-id mapping (container 0 → `host_id`, size 1).
408/// If `username` has an entry in `subid_path` (e.g. /etc/subuid), appends a
409/// range mapping (container 1 → range start, size = range count).
410///
411/// Rootless user-namespace mapping is a Linux/libcontainer concept; on Windows
412/// containers run via HCS so this helper is unix-only.
413#[cfg(unix)]
414fn build_rootless_id_mappings(
415    host_id: u32,
416    subid_path: &str,
417    username: &str,
418) -> Vec<oci_spec::runtime::LinuxIdMapping> {
419    let mut mappings = vec![LinuxIdMappingBuilder::default()
420        .container_id(0_u32)
421        .host_id(host_id)
422        .size(1_u32)
423        .build()
424        .unwrap()];
425    if !username.is_empty() {
426        if let Some((start, count)) = read_subid_range(subid_path, username) {
427            mappings.push(
428                LinuxIdMappingBuilder::default()
429                    .container_id(1_u32)
430                    .host_id(start)
431                    .size(count)
432                    .build()
433                    .unwrap(),
434            );
435        }
436    }
437    mappings
438}
439
/// Read /etc/subuid (or /etc/subgid) and return the (start, count) range
/// allocated to the given username, if any.
///
/// Each line is expected to look like `name:start:count`. The first
/// well-formed line whose name equals `username` wins. Lines for the user
/// that are malformed (missing fields, non-numeric or out-of-range numbers)
/// are skipped rather than aborting the scan, so a later valid entry is still
/// found. Surrounding whitespace on the line and around the numeric fields is
/// ignored.
///
/// Returns None on any I/O error or when the user has no usable entry —
/// callers must fall back to a single-id mapping in that case.
///
/// Subuid files are a Linux concept and the only caller is the unix-gated
/// `build_rootless_id_mappings`, so this helper is unix-only as well.
#[cfg(unix)]
fn read_subid_range(path: &str, username: &str) -> Option<(u32, u32)> {
    let contents = std::fs::read_to_string(path).ok()?;
    contents.lines().find_map(|line| {
        // Inside this closure `?` only rejects the current line.
        let mut fields = line.trim().splitn(3, ':');
        if fields.next()? != username {
            return None;
        }
        let start = fields.next()?.trim().parse::<u32>().ok()?;
        let count = fields.next()?.trim().parse::<u32>().ok()?;
        Some((start, count))
    })
}
462
463impl BundleBuilder {
464    /// Create a new `BundleBuilder` with the specified bundle directory
465    ///
466    /// The bundle directory will be created if it doesn't exist.
467    /// The structure will be:
468    /// ```text
469    /// {bundle_dir}/
470    /// ├── config.json
471    /// └── rootfs/  (symlink to actual rootfs or mount point)
472    /// ```
473    #[must_use]
474    pub fn new(bundle_dir: PathBuf) -> Self {
475        Self {
476            bundle_dir,
477            rootfs_path: None,
478            hostname: None,
479            extra_env: Vec::new(),
480            cwd: None,
481            args: None,
482            volume_paths: HashMap::new(),
483            image_config: None,
484            host_network: false,
485            secrets_provider: None,
486            deployment_scope: None,
487            socket_path: None,
488            cdi_registry: None,
489        }
490    }
491
492    /// Override the CDI registry used for GPU device resolution.
493    ///
494    /// When unset, [`build_oci_spec`](Self::build_oci_spec) discovers CDI
495    /// specs lazily from the standard system search paths (`/etc/cdi`,
496    /// `/var/run/cdi`, plus `$CDI_SPEC_DIRS`). Tests use this setter to
497    /// inject fixture-backed registries pointed at a temp directory.
498    #[must_use]
499    pub fn with_cdi_registry(mut self, registry: Arc<CdiRegistry>) -> Self {
500        self.cdi_registry = Some(registry);
501        self
502    }
503
504    /// Create a `BundleBuilder` for a container in the default bundle location
505    #[must_use]
506    pub fn for_container(container_id: &ContainerId) -> Self {
507        let bundle_dir = zlayer_paths::ZLayerDirs::system_default()
508            .bundles()
509            .join(container_id.to_string());
510        Self::new(bundle_dir)
511    }
512
513    /// Set the rootfs path (from unpacked image layers)
514    ///
515    /// This path will be symlinked into the bundle as `rootfs/`
516    #[must_use]
517    pub fn with_rootfs(mut self, rootfs_path: PathBuf) -> Self {
518        self.rootfs_path = Some(rootfs_path);
519        self
520    }
521
522    /// Set a custom hostname for the container
523    #[must_use]
524    pub fn with_hostname(mut self, hostname: String) -> Self {
525        self.hostname = Some(hostname);
526        self
527    }
528
529    /// Add extra environment variables
530    #[must_use]
531    pub fn with_env(mut self, key: String, value: String) -> Self {
532        self.extra_env.push((key, value));
533        self
534    }
535
536    /// Set the working directory
537    #[must_use]
538    pub fn with_cwd(mut self, cwd: String) -> Self {
539        self.cwd = Some(cwd);
540        self
541    }
542
543    /// Set the command/args to run
544    #[must_use]
545    pub fn with_args(mut self, args: Vec<String>) -> Self {
546        self.args = Some(args);
547        self
548    }
549
550    /// Set pre-resolved volume paths from `StorageManager`
551    ///
552    /// These are used to map named/anonymous/S3 volumes to their host paths
553    /// when building storage mounts in the OCI spec.
554    #[must_use]
555    pub fn with_volume_paths(mut self, volume_paths: HashMap<String, PathBuf>) -> Self {
556        self.volume_paths = volume_paths;
557        self
558    }
559
560    /// Set the OCI image configuration (entrypoint, cmd, env, workdir, user)
561    ///
562    /// When set, the image config provides defaults for the container process
563    /// that are used when the deployment spec doesn't override them.
564    #[must_use]
565    pub fn with_image_config(mut self, config: zlayer_registry::ImageConfig) -> Self {
566        self.image_config = Some(config);
567        self
568    }
569
570    /// Enable host networking mode
571    ///
572    /// When true, the container will NOT get its own network namespace and will
573    /// share the host's network stack. This is equivalent to Docker's `--network host`.
574    /// Use this when overlay networking is unavailable or not desired.
575    #[must_use]
576    pub fn with_host_network(mut self, host_network: bool) -> Self {
577        self.host_network = host_network;
578        self
579    }
580
581    /// Set the secrets provider for resolving `$S:` prefixed environment variables
582    ///
583    /// When set, environment variables with `$S:secret-name` syntax will be resolved
584    /// from this provider at bundle creation time.
585    #[must_use]
586    pub fn with_secrets_provider(mut self, provider: Arc<dyn SecretsProvider>) -> Self {
587        self.secrets_provider = Some(provider);
588        self
589    }
590
591    /// Set the deployment scope for secret lookups
592    ///
593    /// This is typically the deployment name and is used as the scope when
594    /// resolving `$S:` prefixed environment variables.
595    #[must_use]
596    pub fn with_deployment_scope(mut self, scope: String) -> Self {
597        self.deployment_scope = Some(scope);
598        self
599    }
600
601    /// Set a host-side Unix socket path to bind-mount into the container at
602    /// the default `ZLayer` socket path (read-only).
603    #[must_use]
604    pub fn with_socket_mount(mut self, path: impl Into<String>) -> Self {
605        self.socket_path = Some(path.into());
606        self
607    }
608
609    /// Get the bundle directory path
610    #[must_use]
611    pub fn bundle_dir(&self) -> &Path {
612        &self.bundle_dir
613    }
614
615    /// Build the OCI bundle from a `ServiceSpec`
616    ///
617    /// Creates the bundle directory structure and generates config.json
618    /// based on the provided service specification.
619    ///
620    /// # Returns
621    /// The path to the bundle directory on success
622    ///
623    /// # Errors
624    /// - `AgentError::CreateFailed` if directory creation fails
625    /// - `AgentError::InvalidSpec` if the OCI spec generation fails
626    ///
627    /// # Platform
628    /// Unix-only. Uses `tokio::fs::symlink` which is defined in terms of
629    /// `std::os::unix::fs::symlink` and does not exist on Windows. The Windows
630    /// WSL2 delegate path should call [`BundleBuilder::build_spec_only`] to
631    /// obtain the OCI [`Spec`] and pipe it into the WSL2 distro, where the
632    /// Linux side of the delegate handles bundle directory creation.
633    #[cfg(unix)]
634    pub async fn build(&self, container_id: &ContainerId, spec: &ServiceSpec) -> Result<PathBuf> {
635        // Create bundle directory
636        fs::create_dir_all(&self.bundle_dir)
637            .await
638            .map_err(|e| AgentError::CreateFailed {
639                id: container_id.to_string(),
640                reason: format!("failed to create bundle directory: {e}"),
641            })?;
642
643        // Set up rootfs (symlink or create empty directory)
644        let rootfs_in_bundle = self.bundle_dir.join("rootfs");
645        if let Some(ref rootfs_path) = self.rootfs_path {
646            // Remove existing rootfs symlink/dir if present
647            let _ = fs::remove_file(&rootfs_in_bundle).await;
648            let _ = fs::remove_dir(&rootfs_in_bundle).await;
649
650            // Create symlink to actual rootfs.
651            // On Unix: `tokio::fs::symlink` (unified file/dir symlink).
652            // On Windows: `tokio::fs::symlink_dir` (wraps CreateSymbolicLinkW with
653            // SYMBOLIC_LINK_FLAG_DIRECTORY) — rootfs is always an OCI layer directory.
654            #[cfg(unix)]
655            tokio::fs::symlink(rootfs_path, &rootfs_in_bundle)
656                .await
657                .map_err(|e| AgentError::CreateFailed {
658                    id: container_id.to_string(),
659                    reason: format!(
660                        "failed to symlink rootfs from {} to {}: {}",
661                        rootfs_path.display(),
662                        rootfs_in_bundle.display(),
663                        e
664                    ),
665                })?;
666
667            #[cfg(windows)]
668            tokio::fs::symlink_dir(rootfs_path, &rootfs_in_bundle)
669                .await
670                .map_err(|e| AgentError::CreateFailed {
671                    id: container_id.to_string(),
672                    reason: format!(
673                        "failed to symlink rootfs from {} to {}: {}",
674                        rootfs_path.display(),
675                        rootfs_in_bundle.display(),
676                        e
677                    ),
678                })?;
679        } else {
680            // Create empty rootfs directory (for bind mounts)
681            fs::create_dir_all(&rootfs_in_bundle)
682                .await
683                .map_err(|e| AgentError::CreateFailed {
684                    id: container_id.to_string(),
685                    reason: format!("failed to create rootfs directory: {e}"),
686                })?;
687        }
688
689        // Generate OCI runtime spec
690        let oci_spec = self
691            .build_spec_only(container_id, spec, &self.volume_paths)
692            .await?;
693
694        // Write config.json
695        let config_path = self.bundle_dir.join("config.json");
696        let config_json =
697            serde_json::to_string_pretty(&oci_spec).map_err(|e| AgentError::CreateFailed {
698                id: container_id.to_string(),
699                reason: format!("failed to serialize OCI spec: {e}"),
700            })?;
701
702        fs::write(&config_path, config_json)
703            .await
704            .map_err(|e| AgentError::CreateFailed {
705                id: container_id.to_string(),
706                reason: format!("failed to write config.json: {e}"),
707            })?;
708
709        tracing::debug!(
710            "Created OCI bundle at {} for container {}",
711            self.bundle_dir.display(),
712            container_id
713        );
714
715        Ok(self.bundle_dir.clone())
716    }
717
718    /// Render the OCI runtime spec without creating a bundle directory
719    /// or writing `config.json`.
720    ///
721    /// This is the cross-platform entry point for OCI spec generation and is
722    /// the only bundle-builder method that is callable on Windows. Used by the
723    /// WSL2 delegate runtime (`runtimes/wsl2_delegate.rs`): the Windows host
724    /// renders the spec, then streams the JSON into the WSL distro filesystem
725    /// where `youki` will consume it. The bundle path passed to
726    /// `BundleBuilder::new` is purely informational in that flow; this method
727    /// never touches the filesystem.
728    ///
729    /// Unix hosts that want both the spec *and* the on-disk bundle layout
730    /// (rootfs symlink, `config.json`, parent directories) should continue to
731    /// use [`BundleBuilder::build`] or [`BundleBuilder::write_config`].
732    ///
733    /// # Errors
734    /// Returns [`AgentError::InvalidSpec`] if any of the OCI `*Builder` types
735    /// reject the configuration, or if environment-variable secret resolution
736    /// fails.
737    pub async fn build_spec_only(
738        &self,
739        container_id: &ContainerId,
740        spec: &ServiceSpec,
741        volume_paths: &std::collections::HashMap<String, PathBuf>,
742    ) -> Result<oci_spec::runtime::Spec> {
743        self.build_oci_spec(container_id, spec, volume_paths).await
744    }
745
746    /// Resolve CDI edits for a service spec's GPU request, if any.
747    ///
748    /// Returns:
749    /// - `Ok(None)` when the spec has no `GpuSpec`, when the vendor isn't a
750    ///   known CDI-published kind (e.g. `"apple"`), or when no explicit
751    ///   registry was set and lazy discovery turned up no installed specs
752    ///   (production fallback — baked-in defaults take over).
753    /// - `Ok(Some(vec))` with one entry per requested device when CDI specs
754    ///   are available and resolution succeeds.
755    /// - `Err(AgentError::InvalidSpec(...))` when the caller explicitly opted
756    ///   into CDI (via `with_cdi_registry`) but the resolution fails —
757    ///   surfaces [`cdi::CdiError::SpecMissing`] /
758    ///   [`cdi::CdiError::DeviceMissing`] / [`cdi::CdiError::NoDevices`] as
759    ///   actionable strings.
760    fn resolve_cdi_edits(&self, spec: &ServiceSpec) -> Result<Option<Vec<CdiContainerEdits>>> {
761        let Some(ref gpu) = spec.resources.gpu else {
762            return Ok(None);
763        };
764
765        // Map short vendor to CDI kind. Unknown vendors (e.g. "apple") fall
766        // back to baked-in behavior.
767        let Some(kind) = cdi::vendor_to_cdi_kind(&gpu.vendor) else {
768            return Ok(None);
769        };
770
771        // Decide registry source:
772        // - Explicit override: strict mode. Missing kind/device == hard error.
773        // - Lazy discover: opportunistic. Missing kind == silent fallback to
774        //   baked-in defaults so prod hosts without CDI installed keep
775        //   working.
776        let (registry, strict) = if let Some(reg) = &self.cdi_registry {
777            (reg.clone(), true)
778        } else {
779            let reg = Arc::new(CdiRegistry::discover());
780            if reg.is_empty() {
781                return Ok(None);
782            }
783            (reg, false)
784        };
785
786        let device_names: Vec<String> = (0..gpu.count).map(|i| i.to_string()).collect();
787
788        match registry.resolve_for_kind(kind, &device_names) {
789            Ok(edits) => Ok(Some(edits)),
790            Err(err) => {
791                if strict {
792                    Err(AgentError::InvalidSpec(format!(
793                        "CDI resolution failed for vendor '{}': {err}",
794                        gpu.vendor
795                    )))
796                } else {
797                    tracing::warn!(
798                        vendor = %gpu.vendor,
799                        kind = %kind,
800                        error = %err,
801                        "CDI resolution failed; falling back to baked-in GPU device passthrough"
802                    );
803                    Ok(None)
804                }
805            }
806        }
807    }
808
809    /// Build the OCI runtime spec from `ServiceSpec`.
810    ///
811    /// The full, CDI-aware implementation that backs both
812    /// [`BundleBuilder::build_spec_only`] (cross-platform, public) and the
813    /// Unix-only [`BundleBuilder::build`] / [`BundleBuilder::write_config`]
814    /// paths that additionally manage the bundle directory on disk.
815    ///
816    /// # Errors
817    /// Returns [`AgentError::InvalidSpec`] if any of the OCI `*Builder` types
818    /// reject the configuration, or if environment-variable secret resolution
819    /// fails.
820    ///
821    /// # Panics
822    /// Panics if the builder-internal `MountBuilder::build()` call fails for
823    /// the optional `ZLayer` API socket bind-mount. This is only reachable when
824    /// [`BundleBuilder::with_socket_mount`] has been used with a malformed
825    /// path, and is treated as a programmer error because all fields are
826    /// statically constructed from known-good inputs.
827    #[allow(clippy::too_many_lines)]
828    async fn build_oci_spec(
829        &self,
830        container_id: &ContainerId,
831        spec: &ServiceSpec,
832        volume_paths: &std::collections::HashMap<String, PathBuf>,
833    ) -> Result<Spec> {
834        // Resolve CDI edits up front. When present, these replace the
835        // baked-in vendor device-node / env injection below; when absent
836        // (no CDI installed, unknown vendor), the legacy code paths run.
837        let cdi_edits = self.resolve_cdi_edits(spec)?;
838
839        // Build user: image config user > root (spec doesn't currently have user override)
840        let user = {
841            let (uid, gid) = if let Some(user_str) = self
842                .image_config
843                .as_ref()
844                .and_then(|c| c.user.as_ref())
845                .filter(|u| !u.is_empty())
846            {
847                // Parse "uid:gid" or "uid" format from image config
848                let parts: Vec<&str> = user_str.splitn(2, ':').collect();
849                let uid = parts[0].parse::<u32>().unwrap_or(0);
850                let gid = if parts.len() > 1 {
851                    parts[1].parse::<u32>().unwrap_or(0)
852                } else {
853                    uid
854                };
855                (uid, gid)
856            } else {
857                (0u32, 0u32)
858            };
859
860            UserBuilder::default()
861                .uid(uid)
862                .gid(gid)
863                .build()
864                .map_err(|e| AgentError::InvalidSpec(format!("failed to build user: {e}")))?
865        };
866
867        // Build environment variables
868        // Layer: image config env (base) -> defaults -> spec env -> builder extra env
869        let mut env: Vec<String> = Vec::new();
870        let mut env_keys: HashSet<String> = HashSet::new();
871
872        // Seed with image config env first (lowest priority)
873        if let Some(img_env) = self.image_config.as_ref().and_then(|c| c.env.as_ref()) {
874            for entry in img_env {
875                if let Some(key) = entry.split('=').next() {
876                    env_keys.insert(key.to_string());
877                }
878                env.push(entry.clone());
879            }
880        }
881
882        // If image config didn't provide PATH, add the default
883        if !env_keys.contains("PATH") {
884            env.push(
885                "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin".to_string(),
886            );
887            env_keys.insert("PATH".to_string());
888        }
889
890        // Add TERM for interactive compatibility (if not already set)
891        if !env_keys.contains("TERM") {
892            env.push("TERM=xterm".to_string());
893            env_keys.insert("TERM".to_string());
894        }
895
896        // Add service-specific env vars, resolving $S: and $E: prefixed references
897        // These override image config env for same keys
898        //
899        // When a secrets provider is available, use the full secrets-aware resolver
900        // that handles both $S: (secret) and $E: (env) prefixed values.
901        // Otherwise fall back to the env-only resolver.
902        if let (Some(secrets_provider), Some(scope)) =
903            (&self.secrets_provider, &self.deployment_scope)
904        {
905            let resolved_map =
906                crate::env::resolve_env_with_secrets(&spec.env, secrets_provider.as_ref(), scope)
907                    .await
908                    .map_err(|e| {
909                        AgentError::InvalidSpec(format!(
910                            "environment variable resolution failed: {e}"
911                        ))
912                    })?;
913
914            for (key, value) in &resolved_map {
915                if env_keys.contains(key.as_str()) {
916                    env.retain(|e| e.split('=').next() != Some(key.as_str()));
917                }
918                env_keys.insert(key.clone());
919                env.push(format!("{key}={value}"));
920            }
921        } else {
922            let resolved = crate::env::resolve_env_vars_with_warnings(&spec.env).map_err(|e| {
923                AgentError::InvalidSpec(format!("environment variable resolution failed: {e}"))
924            })?;
925
926            // Log any warnings about resolved env vars
927            for warning in &resolved.warnings {
928                tracing::warn!(container = %container_id, "{}", warning);
929            }
930
931            // Merge spec env: spec values take precedence over image config for same keys
932            for var in &resolved.vars {
933                if let Some(key) = var.split('=').next() {
934                    if env_keys.contains(key) {
935                        // Remove the old entry from image config
936                        env.retain(|e| e.split('=').next() != Some(key));
937                    }
938                    env_keys.insert(key.to_string());
939                }
940                env.push(var.clone());
941            }
942        }
943
944        // Add extra env vars from builder (highest priority)
945        for (key, value) in &self.extra_env {
946            if env_keys.contains(key.as_str()) {
947                env.retain(|e| e.split('=').next() != Some(key.as_str()));
948            }
949            env_keys.insert(key.clone());
950            env.push(format!("{key}={value}"));
951        }
952
953        // GPU device visibility environment variables.
954        //
955        // When CDI edits are available, the vendor-supplied spec is the
956        // source of truth (e.g. NVIDIA's `nvidia-ctk cdi generate` emits
957        // `NVIDIA_VISIBLE_DEVICES` plus driver-capability env on every
958        // device entry). Otherwise fall back to the historical baked-in
959        // strings so non-CDI hosts continue to advertise the right devices
960        // to CUDA/ROCm/oneAPI runtimes.
961        if let Some(ref edits_per_device) = cdi_edits {
962            for edits in edits_per_device {
963                for entry in &edits.env {
964                    if let Some(key) = entry.split('=').next() {
965                        if env_keys.contains(key) {
966                            env.retain(|e| e.split('=').next() != Some(key));
967                        }
968                        env_keys.insert(key.to_string());
969                    }
970                    env.push(entry.clone());
971                }
972            }
973        } else if let Some(ref gpu) = spec.resources.gpu {
974            // Default to 0..count when no explicit indices are provided
975            let indices: Vec<String> = (0..gpu.count).map(|i| i.to_string()).collect();
976            let device_list = indices.join(",");
977            match gpu.vendor.as_str() {
978                "nvidia" => {
979                    env.push(format!("NVIDIA_VISIBLE_DEVICES={device_list}"));
980                    env.push(format!("CUDA_VISIBLE_DEVICES={device_list}"));
981                }
982                "amd" => {
983                    env.push(format!("ROCR_VISIBLE_DEVICES={device_list}"));
984                    env.push(format!("HIP_VISIBLE_DEVICES={device_list}"));
985                }
986                "intel" => {
987                    env.push(format!("ZE_AFFINITY_MASK={device_list}"));
988                }
989                _ => {}
990            }
991        }
992
993        // GPU sharing (MPS / time-slicing) env injection.
994        //
995        // Layered on top of the CDI / baked-in `*_VISIBLE_DEVICES` block above:
996        // * MPS: validate host pipe/log dirs exist (error otherwise) and
997        //   export `CUDA_MPS_PIPE_DIRECTORY` / `CUDA_MPS_LOG_DIRECTORY`.
998        // * Time-slicing: override `CUDA_VISIBLE_DEVICES` to the configured
999        //   slice index so the workload sees a single virtualised GPU rather
1000        //   than the full 0..count list emitted above.
1001        //
1002        // The mount side (bind-mounting the MPS dirs / time-slicing config
1003        // file) is handled further down where the rest of the mounts get
1004        // assembled.
1005        let mps_dirs = if let Some(ref gpu) = spec.resources.gpu {
1006            resolve_mps_dirs(gpu)?
1007        } else {
1008            None
1009        };
1010        if let Some(ref dirs) = mps_dirs {
1011            let pipe = format!("CUDA_MPS_PIPE_DIRECTORY={}", dirs.pipe_dir.display());
1012            let log = format!("CUDA_MPS_LOG_DIRECTORY={}", dirs.log_dir.display());
1013            if env_keys.contains("CUDA_MPS_PIPE_DIRECTORY") {
1014                env.retain(|e| e.split('=').next() != Some("CUDA_MPS_PIPE_DIRECTORY"));
1015            }
1016            if env_keys.contains("CUDA_MPS_LOG_DIRECTORY") {
1017                env.retain(|e| e.split('=').next() != Some("CUDA_MPS_LOG_DIRECTORY"));
1018            }
1019            env_keys.insert("CUDA_MPS_PIPE_DIRECTORY".to_string());
1020            env_keys.insert("CUDA_MPS_LOG_DIRECTORY".to_string());
1021            env.push(pipe);
1022            env.push(log);
1023        }
1024        if let Some(ref gpu) = spec.resources.gpu {
1025            if gpu.sharing == Some(GpuSharingMode::TimeSlice) {
1026                if let Some(idx) = gpu.time_slice_index {
1027                    // Time-slicing virtualises a single physical GPU as N
1028                    // slices; the workload sees one device, addressed by
1029                    // its slice index. Override whatever the CDI / baked-in
1030                    // path emitted earlier.
1031                    env.retain(|e| e.split('=').next() != Some("CUDA_VISIBLE_DEVICES"));
1032                    env_keys.insert("CUDA_VISIBLE_DEVICES".to_string());
1033                    env.push(format!("CUDA_VISIBLE_DEVICES={idx}"));
1034                }
1035            }
1036        }
1037
1038        // Inject distributed training coordination env vars when configured.
1039        // MASTER_ADDR uses the service DNS name (resolved by the overlay DNS).
1040        // RANK defaults to 0 (overridden by the agent when placing specific replicas).
1041        if let Some(ref gpu) = spec.resources.gpu {
1042            if let Some(ref dist) = gpu.distributed {
1043                env.push(format!("MASTER_PORT={}", dist.master_port));
1044                env.push(format!("MASTER_ADDR={}", container_id.service));
1045                env.push("WORLD_SIZE=1".to_string());
1046                env.push("RANK=0".to_string());
1047                env.push("LOCAL_RANK=0".to_string());
1048                match dist.backend.as_str() {
1049                    "nccl" => env.push("NCCL_SOCKET_IFNAME=eth0".to_string()),
1050                    "gloo" => env.push("GLOO_SOCKET_IFNAME=eth0".to_string()),
1051                    _ => {}
1052                }
1053            }
1054        }
1055
1056        // Build capabilities
1057        let capabilities = self.build_capabilities(spec)?;
1058
1059        // Determine working directory: builder override > spec.command.workdir > image config > "/"
1060        let cwd = self
1061            .cwd
1062            .clone()
1063            .or_else(|| spec.command.workdir.clone())
1064            .or_else(|| {
1065                self.image_config
1066                    .as_ref()
1067                    .and_then(|c| c.working_dir.as_ref())
1068                    .filter(|w| !w.is_empty())
1069                    .cloned()
1070            })
1071            .unwrap_or_else(|| "/".to_string());
1072
1073        // Resolve process args: builder override > spec command > image config > /bin/sh
1074        let process_args = if let Some(ref args) = self.args {
1075            args.clone()
1076        } else {
1077            Self::resolve_command_from_spec(spec, self.image_config.as_ref())
1078        };
1079
1080        // Build process
1081        let mut process_builder = ProcessBuilder::default()
1082            .terminal(false)
1083            .user(user)
1084            .env(env)
1085            .args(process_args)
1086            .cwd(cwd)
1087            .no_new_privileges(!spec.privileged && spec.capabilities.is_empty());
1088
1089        // Set capabilities if we have them
1090        if let Some(caps) = capabilities {
1091            process_builder = process_builder.capabilities(caps);
1092        }
1093
1094        let process = process_builder
1095            .build()
1096            .map_err(|e| AgentError::InvalidSpec(format!("failed to build process: {e}")))?;
1097
1098        // Build root filesystem config
1099        // Note: "rootfs" is relative to the bundle directory per OCI spec
1100        let root = RootBuilder::default()
1101            .path("rootfs".to_string())
1102            .readonly(false)
1103            .build()
1104            .map_err(|e| AgentError::InvalidSpec(format!("failed to build root: {e}")))?;
1105
1106        // Build default mounts
1107        let mut mounts = self.build_default_mounts(spec)?;
1108
1109        // Add storage mounts from spec
1110        let storage_mounts = self.build_storage_mounts(spec, volume_paths)?;
1111        mounts.extend(storage_mounts);
1112
1113        // Add ZLayer API socket bind-mount if configured.
1114        // Use typ("bind") so libcontainer's mount code handles the source path
1115        // correctly for sockets (canonicalize + file-based mount point creation).
1116        if let Some(ref socket_path) = self.socket_path {
1117            mounts.push(
1118                MountBuilder::default()
1119                    .destination(zlayer_paths::ZLayerDirs::default_socket_path())
1120                    .typ("bind")
1121                    .source(socket_path.clone())
1122                    .options(vec!["rbind".into(), "ro".into()])
1123                    .build()
1124                    .expect("valid socket mount"),
1125            );
1126        }
1127
1128        // Container DNS resolver injection.
1129        //
1130        // youki/libcontainer does no resolv.conf handling on its own: the
1131        // container sees whatever `/etc/resolv.conf` the image shipped (often
1132        // empty/absent). When the spec carries explicit resolver addresses
1133        // (`spec.dns`, populated upstream in `ServiceManager` with the overlay
1134        // resolver's node-IP — the host's own resolv.conf is unusable because
1135        // the netbird `~.` systemd-resolved hijack swallows container queries),
1136        // we materialize a minimal resolv.conf alongside the bundle and
1137        // bind-mount it read-only at `/etc/resolv.conf`.
1138        //
1139        // The `resolv.conf` `nameserver` directive has no port syntax (always
1140        // port 53), which is exactly why the overlay DNS server must already be
1141        // bound on `<node_ip>:53` for this address to be useful.
1142        //
1143        // Host-network containers share the host's `/etc/resolv.conf` directly,
1144        // so we skip injection for them (matching the Docker runtime). On the
1145        // WSL2-on-Windows render path `build_spec_only` is called without an
1146        // on-disk bundle directory; the `bundle_dir.exists()` guard skips the
1147        // file write + mount there, preserving today's behavior.
1148        if !spec.host_network && !spec.dns.is_empty() && self.bundle_dir.exists() {
1149            let resolv_path = self.bundle_dir.join("resolv.conf");
1150            let contents = generate_resolv_conf(&spec.dns);
1151            fs::write(&resolv_path, contents).await.map_err(|e| {
1152                AgentError::InvalidSpec(format!(
1153                    "failed to write resolv.conf to bundle at {}: {e}",
1154                    resolv_path.display()
1155                ))
1156            })?;
1157            mounts.push(
1158                MountBuilder::default()
1159                    .destination("/etc/resolv.conf".to_string())
1160                    .typ("bind")
1161                    .source(resolv_path.to_string_lossy().to_string())
1162                    .options(vec!["rbind".to_string(), "ro".to_string()])
1163                    .build()
1164                    .map_err(|e| {
1165                        AgentError::InvalidSpec(format!("failed to build resolv.conf mount: {e}"))
1166                    })?,
1167            );
1168        }
1169
1170        // Append CDI-provided mounts (e.g. vendor driver libraries that the
1171        // GPU runtime needs to expose to the container).
1172        if let Some(ref edits_per_device) = cdi_edits {
1173            for edits in edits_per_device {
1174                for cdi_mount in &edits.mounts {
1175                    let mut opts = cdi_mount.options.clone();
1176                    if !opts.iter().any(|o| o == "bind" || o == "rbind") {
1177                        opts.push("rbind".to_string());
1178                    }
1179                    mounts.push(
1180                        MountBuilder::default()
1181                            .destination(cdi_mount.container_path.clone())
1182                            .typ("bind")
1183                            .source(cdi_mount.host_path.clone())
1184                            .options(opts)
1185                            .build()
1186                            .map_err(|e| {
1187                                AgentError::InvalidSpec(format!("failed to build CDI mount: {e}"))
1188                            })?,
1189                    );
1190                }
1191            }
1192        }
1193
1194        // GPU sharing mounts.
1195        //
1196        // MPS: bind-mount the host pipe / log directories into the container
1197        // at the same path so the in-container CUDA runtime can talk to the
1198        // MPS daemon over its UNIX socket and append to the shared log.
1199        // The env vars (`CUDA_MPS_PIPE_DIRECTORY` / `CUDA_MPS_LOG_DIRECTORY`)
1200        // are exported earlier in the env-assembly block.
1201        //
1202        // Time-slicing: optionally surface the host's slicing config YAML at
1203        // a well-known read-only path so introspection tools inside the
1204        // container can read it.
1205        if let Some(ref dirs) = mps_dirs {
1206            mounts.push(
1207                MountBuilder::default()
1208                    .destination(dirs.pipe_dir.clone())
1209                    .typ("bind")
1210                    .source(dirs.pipe_dir.clone())
1211                    .options(vec!["rbind".into(), "rw".into()])
1212                    .build()
1213                    .map_err(|e| {
1214                        AgentError::InvalidSpec(format!("failed to build MPS pipe mount: {e}"))
1215                    })?,
1216            );
1217            mounts.push(
1218                MountBuilder::default()
1219                    .destination(dirs.log_dir.clone())
1220                    .typ("bind")
1221                    .source(dirs.log_dir.clone())
1222                    .options(vec!["rbind".into(), "rw".into()])
1223                    .build()
1224                    .map_err(|e| {
1225                        AgentError::InvalidSpec(format!("failed to build MPS log mount: {e}"))
1226                    })?,
1227            );
1228        }
1229        if let Some(ref gpu) = spec.resources.gpu {
1230            if gpu.sharing == Some(GpuSharingMode::TimeSlice) {
1231                if let Some(ref cfg_path) = gpu.time_slicing_config_path {
1232                    let host = PathBuf::from(cfg_path);
1233                    if !host.is_file() {
1234                        return Err(AgentError::GpuSharingUnavailable {
1235                            mode: "time-slice".to_string(),
1236                            reason: format!(
1237                                "time-slicing config {} is not a regular file on the host",
1238                                host.display()
1239                            ),
1240                        });
1241                    }
1242                    mounts.push(
1243                        MountBuilder::default()
1244                            .destination(PathBuf::from(TIMESLICE_CONFIG_CONTAINER_PATH))
1245                            .typ("bind")
1246                            .source(host)
1247                            .options(vec!["rbind".into(), "ro".into()])
1248                            .build()
1249                            .map_err(|e| {
1250                                AgentError::InvalidSpec(format!(
1251                                    "failed to build time-slicing config mount: {e}"
1252                                ))
1253                            })?,
1254                    );
1255                }
1256            }
1257        }
1258
1259        // Build Linux-specific config
1260        let linux = self.build_linux_config(container_id, spec, cdi_edits.as_deref())?;
1261
1262        // Determine hostname
1263        let hostname = self
1264            .hostname
1265            .clone()
1266            .unwrap_or_else(|| container_id.to_string());
1267
1268        // Build the complete spec, attaching any CDI-provided hooks.
1269        let mut spec_builder = SpecBuilder::default()
1270            .version("1.0.2".to_string())
1271            .root(root)
1272            .process(process)
1273            .hostname(hostname)
1274            .mounts(mounts)
1275            .linux(linux);
1276
1277        if let Some(ref edits_per_device) = cdi_edits {
1278            if let Some(hooks) = Self::build_hooks_from_cdi(edits_per_device)? {
1279                spec_builder = spec_builder.hooks(hooks);
1280            }
1281        }
1282
1283        let oci_spec = spec_builder
1284            .build()
1285            .map_err(|e| AgentError::InvalidSpec(format!("failed to build OCI spec: {e}")))?;
1286
1287        Ok(oci_spec)
1288    }
1289
1290    /// Convert the union of CDI hooks across all resolved devices into an
1291    /// OCI [`Hooks`] block.
1292    ///
1293    /// Returns `Ok(None)` when no device contributed hooks (so the spec
1294    /// builder skips the empty block — `oci-spec` treats `null` as "no
1295    /// hooks" while serializers may emit empty arrays otherwise).
1296    fn build_hooks_from_cdi(edits_per_device: &[CdiContainerEdits]) -> Result<Option<Hooks>> {
1297        let mut prestart: Vec<Hook> = Vec::new();
1298        let mut create_runtime: Vec<Hook> = Vec::new();
1299        let mut create_container: Vec<Hook> = Vec::new();
1300        let mut start_container: Vec<Hook> = Vec::new();
1301        let mut poststart: Vec<Hook> = Vec::new();
1302        let mut poststop: Vec<Hook> = Vec::new();
1303
1304        for edits in edits_per_device {
1305            let Some(ref h) = edits.hooks else { continue };
1306            for hook in &h.prestart {
1307                prestart.push(convert_cdi_hook(hook)?);
1308            }
1309            for hook in &h.create_runtime {
1310                create_runtime.push(convert_cdi_hook(hook)?);
1311            }
1312            for hook in &h.create_container {
1313                create_container.push(convert_cdi_hook(hook)?);
1314            }
1315            for hook in &h.start_container {
1316                start_container.push(convert_cdi_hook(hook)?);
1317            }
1318            for hook in &h.poststart {
1319                poststart.push(convert_cdi_hook(hook)?);
1320            }
1321            for hook in &h.poststop {
1322                poststop.push(convert_cdi_hook(hook)?);
1323            }
1324        }
1325
1326        if prestart.is_empty()
1327            && create_runtime.is_empty()
1328            && create_container.is_empty()
1329            && start_container.is_empty()
1330            && poststart.is_empty()
1331            && poststop.is_empty()
1332        {
1333            return Ok(None);
1334        }
1335
1336        let mut builder = HooksBuilder::default();
1337        if !prestart.is_empty() {
1338            #[allow(deprecated)]
1339            {
1340                builder = builder.prestart(prestart);
1341            }
1342        }
1343        if !create_runtime.is_empty() {
1344            builder = builder.create_runtime(create_runtime);
1345        }
1346        if !create_container.is_empty() {
1347            builder = builder.create_container(create_container);
1348        }
1349        if !start_container.is_empty() {
1350            builder = builder.start_container(start_container);
1351        }
1352        if !poststart.is_empty() {
1353            builder = builder.poststart(poststart);
1354        }
1355        if !poststop.is_empty() {
1356            builder = builder.poststop(poststop);
1357        }
1358
1359        let hooks = builder
1360            .build()
1361            .map_err(|e| AgentError::InvalidSpec(format!("failed to build CDI hooks: {e}")))?;
1362        Ok(Some(hooks))
1363    }
1364
1365    /// Build Linux capabilities configuration
1366    #[allow(clippy::unused_self)]
1367    fn build_capabilities(
1368        &self,
1369        spec: &ServiceSpec,
1370    ) -> Result<Option<oci_spec::runtime::LinuxCapabilities>> {
1371        if spec.privileged {
1372            // Privileged mode: all capabilities
1373            let all_caps: HashSet<Capability> = ALL_CAPABILITIES.iter().copied().collect();
1374            let empty_caps: HashSet<Capability> = HashSet::new();
1375
1376            let caps = LinuxCapabilitiesBuilder::default()
1377                .bounding(all_caps.clone())
1378                .effective(all_caps.clone())
1379                .permitted(all_caps)
1380                .inheritable(empty_caps.clone())
1381                .ambient(empty_caps)
1382                .build()
1383                .map_err(|e| {
1384                    AgentError::InvalidSpec(format!("failed to build capabilities: {e}"))
1385                })?;
1386
1387            Ok(Some(caps))
1388        } else if !spec.capabilities.is_empty() {
1389            // Specific capabilities requested
1390            let caps: HashSet<Capability> = spec
1391                .capabilities
1392                .iter()
1393                .filter_map(|c| {
1394                    // Normalize capability name (add CAP_ prefix if missing, uppercase)
1395                    let cap_name = if c.starts_with("CAP_") {
1396                        c.to_uppercase()
1397                    } else {
1398                        format!("CAP_{}", c.to_uppercase())
1399                    };
1400                    Capability::from_str(&cap_name).ok()
1401                })
1402                .collect();
1403
1404            let empty_caps: HashSet<Capability> = HashSet::new();
1405
1406            let built_caps = LinuxCapabilitiesBuilder::default()
1407                .bounding(caps.clone())
1408                .effective(caps.clone())
1409                .permitted(caps)
1410                .inheritable(empty_caps.clone())
1411                .ambient(empty_caps)
1412                .build()
1413                .map_err(|e| {
1414                    AgentError::InvalidSpec(format!("failed to build capabilities: {e}"))
1415                })?;
1416
1417            Ok(Some(built_caps))
1418        } else {
1419            // Default: minimal capabilities for basic container operation
1420            let default_caps: HashSet<Capability> = [
1421                Capability::Chown,
1422                Capability::DacOverride,
1423                Capability::Fsetid,
1424                Capability::Fowner,
1425                Capability::Mknod,
1426                Capability::NetRaw,
1427                Capability::Setgid,
1428                Capability::Setuid,
1429                Capability::Setfcap,
1430                Capability::Setpcap,
1431                Capability::NetBindService,
1432                Capability::SysChroot,
1433                Capability::Kill,
1434                Capability::AuditWrite,
1435            ]
1436            .into_iter()
1437            .collect();
1438
1439            let empty_caps: HashSet<Capability> = HashSet::new();
1440
1441            let built_caps = LinuxCapabilitiesBuilder::default()
1442                .bounding(default_caps.clone())
1443                .effective(default_caps.clone())
1444                .permitted(default_caps)
1445                .inheritable(empty_caps.clone())
1446                .ambient(empty_caps)
1447                .build()
1448                .map_err(|e| {
1449                    AgentError::InvalidSpec(format!("failed to build capabilities: {e}"))
1450                })?;
1451
1452            Ok(Some(built_caps))
1453        }
1454    }
1455
1456    /// Build default filesystem mounts for the container
1457    #[allow(clippy::unused_self, clippy::too_many_lines)]
1458    fn build_default_mounts(&self, spec: &ServiceSpec) -> Result<Vec<Mount>> {
1459        let mut mounts = Vec::new();
1460
1461        // /proc
1462        mounts.push(
1463            MountBuilder::default()
1464                .destination("/proc".to_string())
1465                .typ("proc".to_string())
1466                .source("proc".to_string())
1467                .options(vec![
1468                    "nosuid".to_string(),
1469                    "noexec".to_string(),
1470                    "nodev".to_string(),
1471                ])
1472                .build()
1473                .map_err(|e| {
1474                    AgentError::InvalidSpec(format!("failed to build /proc mount: {e}"))
1475                })?,
1476        );
1477
1478        // /dev
1479        mounts.push(
1480            MountBuilder::default()
1481                .destination("/dev".to_string())
1482                .typ("tmpfs".to_string())
1483                .source("tmpfs".to_string())
1484                .options(vec![
1485                    "nosuid".to_string(),
1486                    "strictatime".to_string(),
1487                    "mode=755".to_string(),
1488                    "size=65536k".to_string(),
1489                ])
1490                .build()
1491                .map_err(|e| AgentError::InvalidSpec(format!("failed to build /dev mount: {e}")))?,
1492        );
1493
1494        // /dev/pts
1495        mounts.push(
1496            MountBuilder::default()
1497                .destination("/dev/pts".to_string())
1498                .typ("devpts".to_string())
1499                .source("devpts".to_string())
1500                .options(vec![
1501                    "nosuid".to_string(),
1502                    "noexec".to_string(),
1503                    "newinstance".to_string(),
1504                    "ptmxmode=0666".to_string(),
1505                    "mode=0620".to_string(),
1506                    "gid=5".to_string(),
1507                ])
1508                .build()
1509                .map_err(|e| {
1510                    AgentError::InvalidSpec(format!("failed to build /dev/pts mount: {e}"))
1511                })?,
1512        );
1513
1514        // /dev/shm
1515        mounts.push(
1516            MountBuilder::default()
1517                .destination("/dev/shm".to_string())
1518                .typ("tmpfs".to_string())
1519                .source("shm".to_string())
1520                .options(vec![
1521                    "nosuid".to_string(),
1522                    "noexec".to_string(),
1523                    "nodev".to_string(),
1524                    "mode=1777".to_string(),
1525                    "size=65536k".to_string(),
1526                ])
1527                .build()
1528                .map_err(|e| {
1529                    AgentError::InvalidSpec(format!("failed to build /dev/shm mount: {e}"))
1530                })?,
1531        );
1532
1533        // /dev/mqueue
1534        mounts.push(
1535            MountBuilder::default()
1536                .destination("/dev/mqueue".to_string())
1537                .typ("mqueue".to_string())
1538                .source("mqueue".to_string())
1539                .options(vec![
1540                    "nosuid".to_string(),
1541                    "noexec".to_string(),
1542                    "nodev".to_string(),
1543                ])
1544                .build()
1545                .map_err(|e| {
1546                    AgentError::InvalidSpec(format!("failed to build /dev/mqueue mount: {e}"))
1547                })?,
1548        );
1549
1550        // /sys - read-only unless privileged
1551        let sys_options = if spec.privileged {
1552            vec![
1553                "nosuid".to_string(),
1554                "noexec".to_string(),
1555                "nodev".to_string(),
1556            ]
1557        } else {
1558            vec![
1559                "nosuid".to_string(),
1560                "noexec".to_string(),
1561                "nodev".to_string(),
1562                "ro".to_string(),
1563            ]
1564        };
1565
1566        mounts.push(
1567            MountBuilder::default()
1568                .destination("/sys".to_string())
1569                .typ("sysfs".to_string())
1570                .source("sysfs".to_string())
1571                .options(sys_options)
1572                .build()
1573                .map_err(|e| AgentError::InvalidSpec(format!("failed to build /sys mount: {e}")))?,
1574        );
1575
1576        // /sys/fs/cgroup - for cgroup access
1577        mounts.push(
1578            MountBuilder::default()
1579                .destination("/sys/fs/cgroup".to_string())
1580                .typ("cgroup2".to_string())
1581                .source("cgroup".to_string())
1582                .options(vec![
1583                    "nosuid".to_string(),
1584                    "noexec".to_string(),
1585                    "nodev".to_string(),
1586                    "relatime".to_string(),
1587                ])
1588                .build()
1589                .map_err(|e| {
1590                    AgentError::InvalidSpec(format!("failed to build cgroup mount: {e}"))
1591                })?,
1592        );
1593
1594        Ok(mounts)
1595    }
1596
1597    /// Build storage mounts from `ServiceSpec` storage entries
1598    ///
1599    /// Converts `StorageSpec` entries to OCI Mount entries.
1600    /// Note: Named and Anonymous volumes require `StorageManager` to prepare paths.
1601    /// S3 volumes require s3fs FUSE mount (handled separately).
1602    #[allow(clippy::unused_self, clippy::too_many_lines)]
1603    fn build_storage_mounts(
1604        &self,
1605        spec: &ServiceSpec,
1606        volume_paths: &std::collections::HashMap<String, PathBuf>,
1607    ) -> Result<Vec<Mount>> {
1608        let mut mounts = Vec::new();
1609
1610        for storage in &spec.storage {
1611            let mount = match storage {
1612                StorageSpec::Bind {
1613                    source,
1614                    target,
1615                    readonly,
1616                } => {
1617                    let mut options = vec!["rbind".to_string()];
1618                    if *readonly {
1619                        options.push("ro".to_string());
1620                    } else {
1621                        options.push("rw".to_string());
1622                    }
1623
1624                    MountBuilder::default()
1625                        .destination(target.clone())
1626                        .typ("none".to_string())
1627                        .source(source.clone())
1628                        .options(options)
1629                        .build()
1630                        .map_err(|e| {
1631                            AgentError::InvalidSpec(format!(
1632                                "failed to build bind mount for {target}: {e}"
1633                            ))
1634                        })?
1635                }
1636
1637                StorageSpec::Named {
1638                    name,
1639                    target,
1640                    readonly,
1641                    tier,
1642                    ..
1643                } => {
1644                    // Get the prepared volume path from StorageManager
1645                    let source = volume_paths.get(name).ok_or_else(|| {
1646                        AgentError::InvalidSpec(format!(
1647                            "volume '{name}' not prepared - ensure StorageManager.ensure_volume() was called"
1648                        ))
1649                    })?;
1650
1651                    // Warn about SQLite safety for non-local tiers
1652                    if matches!(tier, StorageTier::Network) {
1653                        tracing::warn!(
1654                            volume = %name,
1655                            tier = ?tier,
1656                            "Network storage tier is NOT SQLite-safe. Avoid using SQLite databases on this volume."
1657                        );
1658                    }
1659
1660                    let mut options = vec!["rbind".to_string()];
1661                    if *readonly {
1662                        options.push("ro".to_string());
1663                    } else {
1664                        options.push("rw".to_string());
1665                    }
1666
1667                    MountBuilder::default()
1668                        .destination(target.clone())
1669                        .typ("none".to_string())
1670                        .source(source.to_string_lossy().to_string())
1671                        .options(options)
1672                        .build()
1673                        .map_err(|e| {
1674                            AgentError::InvalidSpec(format!(
1675                                "failed to build named volume mount for {target}: {e}"
1676                            ))
1677                        })?
1678                }
1679
1680                StorageSpec::Anonymous { target, tier } => {
1681                    // Anonymous volumes should have been created by StorageManager
1682                    // and the path passed in volume_paths with key "_anon_{target}"
1683                    let key = format!("_anon_{}", target.trim_start_matches('/').replace('/', "_"));
1684                    let source = volume_paths.get(&key).ok_or_else(|| {
1685                        AgentError::InvalidSpec(format!(
1686                            "anonymous volume for '{target}' not prepared"
1687                        ))
1688                    })?;
1689
1690                    if matches!(tier, StorageTier::Network) {
1691                        tracing::warn!(
1692                            target = %target,
1693                            tier = ?tier,
1694                            "Network storage tier is NOT SQLite-safe."
1695                        );
1696                    }
1697
1698                    let options = vec!["rbind".to_string(), "rw".to_string()];
1699
1700                    MountBuilder::default()
1701                        .destination(target.clone())
1702                        .typ("none".to_string())
1703                        .source(source.to_string_lossy().to_string())
1704                        .options(options)
1705                        .build()
1706                        .map_err(|e| {
1707                            AgentError::InvalidSpec(format!(
1708                                "failed to build anonymous volume mount for {target}: {e}"
1709                            ))
1710                        })?
1711                }
1712
1713                StorageSpec::Tmpfs { target, size, mode } => {
1714                    let mut options = vec!["nosuid".to_string(), "nodev".to_string()];
1715
1716                    if let Some(size_str) = size {
1717                        options.push(format!("size={size_str}"));
1718                    }
1719
1720                    if let Some(mode_val) = mode {
1721                        options.push(format!("mode={mode_val:o}"));
1722                    }
1723
1724                    MountBuilder::default()
1725                        .destination(target.clone())
1726                        .typ("tmpfs".to_string())
1727                        .source("tmpfs".to_string())
1728                        .options(options)
1729                        .build()
1730                        .map_err(|e| {
1731                            AgentError::InvalidSpec(format!(
1732                                "failed to build tmpfs mount for {target}: {e}"
1733                            ))
1734                        })?
1735                }
1736
1737                StorageSpec::S3 {
1738                    bucket,
1739                    prefix,
1740                    target,
1741                    readonly,
1742                    endpoint: _,
1743                    credentials: _,
1744                } => {
1745                    // S3 mounts are handled via s3fs FUSE
1746                    // The StorageManager should have mounted the bucket and passed the path
1747                    let key = format!("_s3_{}_{}", bucket, prefix.as_deref().unwrap_or(""));
1748                    let source = volume_paths.get(&key).ok_or_else(|| {
1749                        AgentError::InvalidSpec(format!(
1750                            "S3 volume for bucket '{bucket}' not mounted - ensure StorageManager.mount_s3() was called"
1751                        ))
1752                    })?;
1753
1754                    tracing::warn!(
1755                        bucket = %bucket,
1756                        target = %target,
1757                        "S3 storage is NOT SQLite-safe. Use for read-heavy workloads only."
1758                    );
1759
1760                    let mut options = vec!["rbind".to_string()];
1761                    if *readonly {
1762                        options.push("ro".to_string());
1763                    } else {
1764                        options.push("rw".to_string());
1765                    }
1766
1767                    MountBuilder::default()
1768                        .destination(target.clone())
1769                        .typ("none".to_string())
1770                        .source(source.to_string_lossy().to_string())
1771                        .options(options)
1772                        .build()
1773                        .map_err(|e| {
1774                            AgentError::InvalidSpec(format!(
1775                                "failed to build S3 mount for {target}: {e}"
1776                            ))
1777                        })?
1778                }
1779            };
1780
1781            mounts.push(mount);
1782        }
1783
1784        Ok(mounts)
1785    }
1786
1787    /// Build Linux-specific configuration
1788    #[allow(clippy::similar_names)] // euid/egid are POSIX-standard paired names
1789    #[allow(clippy::too_many_lines)]
1790    fn build_linux_config(
1791        &self,
1792        container_id: &ContainerId,
1793        spec: &ServiceSpec,
1794        cdi_edits: Option<&[CdiContainerEdits]>,
1795    ) -> Result<oci_spec::runtime::Linux> {
1796        // Build namespaces
1797        let mut namespaces = vec![
1798            LinuxNamespaceBuilder::default()
1799                .typ(LinuxNamespaceType::Pid)
1800                .build()
1801                .unwrap(),
1802            LinuxNamespaceBuilder::default()
1803                .typ(LinuxNamespaceType::Ipc)
1804                .build()
1805                .unwrap(),
1806            LinuxNamespaceBuilder::default()
1807                .typ(LinuxNamespaceType::Uts)
1808                .build()
1809                .unwrap(),
1810            LinuxNamespaceBuilder::default()
1811                .typ(LinuxNamespaceType::Mount)
1812                .build()
1813                .unwrap(),
1814        ];
1815
1816        // Only add Network namespace when NOT using host networking.
1817        // In host networking mode, the container shares the host's network stack
1818        // (like Docker's --network host).
1819        if !self.host_network {
1820            namespaces.push(
1821                LinuxNamespaceBuilder::default()
1822                    .typ(LinuxNamespaceType::Network)
1823                    .build()
1824                    .unwrap(),
1825            );
1826        }
1827
1828        // `nix::unistd` is unix-only. On non-unix targets (Windows), libcontainer
1829        // is not the runtime path (HCS is) and this function is effectively dead
1830        // code — so we statically force `rootless = false` there and skip the
1831        // user-namespace mapping block entirely.
1832        #[cfg(unix)]
1833        let rootless = !nix::unistd::geteuid().is_root();
1834        #[cfg(not(unix))]
1835        let rootless = false;
1836
1837        if rootless {
1838            namespaces.push(
1839                LinuxNamespaceBuilder::default()
1840                    .typ(LinuxNamespaceType::User)
1841                    .build()
1842                    .unwrap(),
1843            );
1844            namespaces.push(
1845                LinuxNamespaceBuilder::default()
1846                    .typ(LinuxNamespaceType::Cgroup)
1847                    .build()
1848                    .unwrap(),
1849            );
1850        }
1851
1852        let mut linux_builder = LinuxBuilder::default().namespaces(namespaces);
1853
1854        #[cfg(unix)]
1855        if rootless {
1856            let euid = nix::unistd::geteuid();
1857            let egid = nix::unistd::getegid();
1858            let username = nix::unistd::User::from_uid(euid)
1859                .ok()
1860                .flatten()
1861                .map(|u| u.name)
1862                .unwrap_or_default();
1863            linux_builder = linux_builder
1864                .uid_mappings(build_rootless_id_mappings(
1865                    euid.as_raw(),
1866                    "/etc/subuid",
1867                    &username,
1868                ))
1869                .gid_mappings(build_rootless_id_mappings(
1870                    egid.as_raw(),
1871                    "/etc/subgid",
1872                    &username,
1873                ));
1874        }
1875
1876        // Build resources (CPU, memory, devices)
1877        let resources = self.build_resources(spec)?;
1878        if let Some(resources) = resources {
1879            linux_builder = linux_builder.resources(resources);
1880        }
1881
1882        // Build device entries for passthrough.
1883        //
1884        // When CDI edits are present, the vendor-supplied device-node list
1885        // replaces our baked-in vendor-specific defaults — CDI knows the
1886        // host's exact device geometry (which majors/minors map to which
1887        // GPUs) so we trust it over our static `/dev/nvidiaN` enumeration.
1888        let mut devices = self.build_devices(spec, None, cdi_edits.is_some())?;
1889        if let Some(edits_per_device) = cdi_edits {
1890            for edits in edits_per_device {
1891                for node in &edits.device_nodes {
1892                    devices.push(cdi_node_to_oci_device(node)?);
1893                }
1894            }
1895        }
1896        if !devices.is_empty() {
1897            linux_builder = linux_builder.devices(devices);
1898        }
1899
1900        // Set rootfs propagation (matches Docker default)
1901        linux_builder = linux_builder.rootfs_propagation("private".to_string());
1902
1903        // Set masked/readonly paths based on privileged mode
1904        if spec.privileged {
1905            // Privileged containers get no masked paths (full access)
1906            linux_builder = linux_builder.masked_paths(vec![]).readonly_paths(vec![]);
1907        } else {
1908            // Set masked paths for security (hide sensitive host info)
1909            let masked_paths = vec![
1910                "/proc/acpi".to_string(),
1911                "/proc/asound".to_string(),
1912                "/proc/kcore".to_string(),
1913                "/proc/keys".to_string(),
1914                "/proc/latency_stats".to_string(),
1915                "/proc/timer_list".to_string(),
1916                "/proc/timer_stats".to_string(),
1917                "/proc/sched_debug".to_string(),
1918                "/proc/scsi".to_string(),
1919                "/sys/firmware".to_string(),
1920            ];
1921
1922            // Set readonly paths for security
1923            let readonly_paths = vec![
1924                "/proc/bus".to_string(),
1925                "/proc/fs".to_string(),
1926                "/proc/irq".to_string(),
1927                "/proc/sys".to_string(),
1928                "/proc/sysrq-trigger".to_string(),
1929            ];
1930
1931            linux_builder = linux_builder
1932                .masked_paths(masked_paths)
1933                .readonly_paths(readonly_paths);
1934        }
1935
1936        // Determine cgroups_path so libcontainer creates the container cgroup
1937        // under the current process's cgroup rather than at the v2 root. This
1938        // is required when running inside another container (e.g. Forgejo CI
1939        // `container:` block) where `/sys/fs/cgroup/cgroup.subtree_control` is
1940        // read-only. Precedence:
1941        //   1. spec.cgroup_parent (per-service override)         — all platforms
1942        //   2. ZLAYER_CGROUP_PARENT env var (host-wide override) — all platforms
1943        //   3. /proc/self/cgroup (auto-detect when nested)       — Linux only
1944        //   4. unset (default — bare-metal happy path; also the WSL2-delegate
1945        //      case on non-Linux hosts, where libcontainer inside the WSL
1946        //      distro resolves the parent at `zlayer runtime create` time)
1947        let cid = container_id.to_string();
1948
1949        // Explicit overrides are honored on every platform: a user might pin a
1950        // cgroup_parent for a WSL-delegate-bound spec even when this process
1951        // is running on Windows.
1952        let explicit_parent: Option<(String, &'static str)> =
1953            if let Some(p) = spec.cgroup_parent.as_deref().filter(|s| !s.is_empty()) {
1954                Some((p.to_string(), "spec"))
1955            } else if let Some(p) = std::env::var("ZLAYER_CGROUP_PARENT")
1956                .ok()
1957                .filter(|s| !s.is_empty())
1958            {
1959                Some((p, "env"))
1960            } else {
1961                None
1962            };
1963
1964        // Auto-detect (and the "no writable parent" hard error below) are
1965        // Linux-only: they inspect /proc/self/cgroup and /sys/fs/cgroup, which
1966        // don't exist on Windows hosts. When the bundle is destined for the
1967        // WSL2 delegate, cgroup-parent resolution happens inside the distro
1968        // at `zlayer runtime create` time, not here on the host.
1969        #[cfg(target_os = "linux")]
1970        let auto_parent: Option<(String, &'static str)> =
1971            if let Some(p) = crate::capability::ensure_daemon_leaf_and_container_parent() {
1972                Some((p, "auto-init"))
1973            } else if let Some(p) = crate::capability::current_cgroup_v2_path() {
1974                // Fallback: migration failed (likely cgroup root is read-only); use the
1975                // raw scope path. Pre-fix behaviour — surfaces the original error.
1976                Some((p, "auto"))
1977            } else {
1978                None
1979            };
1980        #[cfg(not(target_os = "linux"))]
1981        let auto_parent: Option<(String, &'static str)> = None;
1982
1983        let (cgroup_parent_value, cgroup_parent_source): (Option<String>, &'static str) =
1984            explicit_parent
1985                .or(auto_parent)
1986                .map_or((None, "none"), |(p, s)| (Some(p), s));
1987
1988        // Diagnostic guard rail: capability survey says we're nested, but we
1989        // couldn't resolve a cgroup parent here. This combination should not
1990        // normally happen because both code paths consult the same
1991        // `current_cgroup_v2_path()` helper. Surface it so an operator can
1992        // investigate; do not fail container creation. Linux-only — the
1993        // capability survey is itself a no-op on non-Linux.
1994        #[cfg(target_os = "linux")]
1995        if cgroup_parent_value.is_none() && crate::capability::DaemonCapabilities::get().is_nested {
1996            tracing::warn!(
1997                container_id = %cid,
1998                "capability survey reports nested daemon but cgroup_parent could not be resolved — proceeding with v2 root"
1999            );
2000        }
2001
2002        if let Some(parent) = cgroup_parent_value {
2003            let parent = parent.trim_end_matches('/');
2004            let full = format!("{parent}/{cid}");
2005            match cgroup_parent_source {
2006                "spec" => tracing::info!(
2007                    container_id = %cid,
2008                    source = "spec",
2009                    path = %full,
2010                    "cgroup_parent selected"
2011                ),
2012                "env" => tracing::info!(
2013                    container_id = %cid,
2014                    source = "env",
2015                    path = %full,
2016                    "cgroup_parent selected"
2017                ),
2018                "auto" => tracing::info!(
2019                    container_id = %cid,
2020                    source = "auto",
2021                    path = %full,
2022                    "cgroup_parent selected (from /proc/self/cgroup)"
2023                ),
2024                "auto-init" => tracing::info!(
2025                    container_id = %cid,
2026                    source = "auto-init",
2027                    path = %full,
2028                    "cgroup_parent selected (migrated daemon to <scope>/init; containers go under <scope>/containers)"
2029                ),
2030                _ => unreachable!(),
2031            }
2032            linux_builder = linux_builder.cgroups_path(std::path::PathBuf::from(full));
2033        } else {
2034            // Auto-detect found nothing AND no explicit override. Behaviour
2035            // differs by platform:
2036            //   - Linux: this is a real error in nested-container envs where
2037            //     the cgroup root is read-only. Emit the hard error so an
2038            //     operator fixes the env.
2039            //   - Non-Linux (Windows host building a bundle for the WSL2
2040            //     delegate): expected path; cgroup setup happens inside the
2041            //     distro at runtime-create time.
2042            #[cfg(target_os = "linux")]
2043            {
2044                let caps = crate::capability::DaemonCapabilities::get();
2045                if !caps.can_write_cgroup_root {
2046                    return Err(AgentError::InvalidSpec(format!(
2047                        "cannot create container {cid}: no writable cgroup parent. \
2048                         /proc/self/cgroup reports the cgroup-v2 root, and \
2049                         /sys/fs/cgroup is read-only to this process. Fix one of: \
2050                         (a) run the daemon's outer container with --cgroupns=host \
2051                         so /proc/self/cgroup reports a real parent; \
2052                         (b) set ZLAYER_CGROUP_PARENT=/path/to/writable/cgroup; \
2053                         (c) grant the daemon write access to /sys/fs/cgroup."
2054                    )));
2055                }
2056                tracing::info!(
2057                    container_id = %cid,
2058                    "cgroup_parent unset — libcontainer will use v2 root (cgroup root is writable here)"
2059                );
2060            }
2061            #[cfg(not(target_os = "linux"))]
2062            tracing::debug!(
2063                container_id = %cid,
2064                "non-Linux host — cgroup_parent unset; libcontainer inside the WSL distro will resolve a parent from its cgroup-v2 root"
2065            );
2066        }
2067
2068        linux_builder
2069            .build()
2070            .map_err(|e| AgentError::InvalidSpec(format!("failed to build linux config: {e}")))
2071    }
2072
2073    /// Build resource limits (CPU, memory, device cgroups)
2074    #[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
2075    fn build_resources(
2076        &self,
2077        spec: &ServiceSpec,
2078    ) -> Result<Option<oci_spec::runtime::LinuxResources>> {
2079        let mut resources_builder = LinuxResourcesBuilder::default();
2080        let mut has_resources = false;
2081
2082        // CPU limits
2083        if let Some(cpu_limit) = spec.resources.cpu {
2084            // Convert CPU cores to microseconds quota
2085            // 100000 microseconds = 1 core's worth of time per period
2086            let quota = (cpu_limit * 100_000.0) as i64;
2087            let cpu = LinuxCpuBuilder::default()
2088                .quota(quota)
2089                .period(100_000u64)
2090                .build()
2091                .map_err(|e| AgentError::InvalidSpec(format!("failed to build CPU limits: {e}")))?;
2092
2093            resources_builder = resources_builder.cpu(cpu);
2094            has_resources = true;
2095        }
2096
2097        // Memory limits
2098        if let Some(ref memory_str) = spec.resources.memory {
2099            let bytes = parse_memory_string(memory_str)
2100                .map_err(|e| AgentError::InvalidSpec(format!("invalid memory limit: {e}")))?;
2101
2102            let memory = LinuxMemoryBuilder::default()
2103                .limit(bytes as i64)
2104                .build()
2105                .map_err(|e| {
2106                    AgentError::InvalidSpec(format!("failed to build memory limits: {e}"))
2107                })?;
2108
2109            resources_builder = resources_builder.memory(memory);
2110            has_resources = true;
2111        }
2112
2113        // Device cgroup rules
2114        let device_rules = self.build_device_cgroup_rules(spec, None)?;
2115        if !device_rules.is_empty() {
2116            resources_builder = resources_builder.devices(device_rules);
2117            has_resources = true;
2118        }
2119
2120        if has_resources {
2121            let resources = resources_builder
2122                .build()
2123                .map_err(|e| AgentError::InvalidSpec(format!("failed to build resources: {e}")))?;
2124            Ok(Some(resources))
2125        } else {
2126            Ok(None)
2127        }
2128    }
2129
2130    /// Build device cgroup rules
2131    #[allow(clippy::unused_self, clippy::too_many_lines)]
2132    fn build_device_cgroup_rules(
2133        &self,
2134        spec: &ServiceSpec,
2135        _gpu_indices: Option<&[u32]>,
2136    ) -> Result<Vec<oci_spec::runtime::LinuxDeviceCgroup>> {
2137        let mut rules = Vec::new();
2138
2139        if spec.privileged {
2140            // Privileged mode: allow all devices
2141            let rule = LinuxDeviceCgroupBuilder::default()
2142                .allow(true)
2143                .access("rwm".to_string())
2144                .build()
2145                .map_err(|e| {
2146                    AgentError::InvalidSpec(format!("failed to build device cgroup rule: {e}"))
2147                })?;
2148            rules.push(rule);
2149        } else {
2150            // Default: deny all, then allow specific devices
2151            let deny_all = LinuxDeviceCgroupBuilder::default()
2152                .allow(false)
2153                .access("rwm".to_string())
2154                .build()
2155                .map_err(|e| AgentError::InvalidSpec(format!("failed to build deny rule: {e}")))?;
2156            rules.push(deny_all);
2157
2158            // Allow standard container devices
2159            // /dev/null, /dev/zero, /dev/full, /dev/random, /dev/urandom, /dev/tty
2160            let standard_char_devices = [
2161                (1, 3, "rwm"),    // /dev/null
2162                (1, 5, "rwm"),    // /dev/zero
2163                (1, 7, "rwm"),    // /dev/full
2164                (1, 8, "rwm"),    // /dev/random
2165                (1, 9, "rwm"),    // /dev/urandom
2166                (5, 0, "rwm"),    // /dev/tty
2167                (5, 1, "rwm"),    // /dev/console
2168                (5, 2, "rwm"),    // /dev/ptmx
2169                (136, -1, "rwm"), // /dev/pts/* (wildcard minor)
2170            ];
2171
2172            for (major, minor, access) in standard_char_devices {
2173                let mut builder = LinuxDeviceCgroupBuilder::default()
2174                    .allow(true)
2175                    .typ(LinuxDeviceType::C)
2176                    .major(i64::from(major))
2177                    .access(access.to_string());
2178
2179                if minor >= 0 {
2180                    builder = builder.minor(i64::from(minor));
2181                }
2182
2183                let rule = builder.build().map_err(|e| {
2184                    AgentError::InvalidSpec(format!("failed to build char device rule: {e}"))
2185                })?;
2186                rules.push(rule);
2187            }
2188
2189            // Allow specific devices from spec (Unix-only: requires /dev/* fs
2190            // probing via `MetadataExt::rdev`). On Windows the WSL2 delegate
2191            // path regenerates these inside the Linux distro, so we skip here.
2192            #[cfg(unix)]
2193            for device in &spec.devices {
2194                if let Ok((major, minor)) = get_device_major_minor(&device.path) {
2195                    let dev_type = get_device_type(&device.path).unwrap_or(LinuxDeviceType::C);
2196
2197                    // Build access string
2198                    let mut access = String::new();
2199                    if device.read {
2200                        access.push('r');
2201                    }
2202                    if device.write {
2203                        access.push('w');
2204                    }
2205                    if device.mknod {
2206                        access.push('m');
2207                    }
2208                    if access.is_empty() {
2209                        access = "rw".to_string();
2210                    }
2211
2212                    let rule = LinuxDeviceCgroupBuilder::default()
2213                        .allow(true)
2214                        .typ(dev_type)
2215                        .major(major)
2216                        .minor(minor)
2217                        .access(access)
2218                        .build()
2219                        .map_err(|e| {
2220                            AgentError::InvalidSpec(format!(
2221                                "failed to build device rule for {}: {}",
2222                                device.path, e
2223                            ))
2224                        })?;
2225                    rules.push(rule);
2226                } else {
2227                    tracing::warn!("Failed to get device info for {}, skipping", device.path);
2228                }
2229            }
2230
2231            // Auto-allow GPU devices in cgroup when gpu spec is set
2232            if let Some(ref gpu) = spec.resources.gpu {
2233                match gpu.vendor.as_str() {
2234                    "nvidia" => {
2235                        // Allow all nvidia devices (major 195 for nvidia GPUs)
2236                        let rule = LinuxDeviceCgroupBuilder::default()
2237                            .allow(true)
2238                            .typ(LinuxDeviceType::C)
2239                            .major(195i64)
2240                            .access("rwm".to_string())
2241                            .build()
2242                            .map_err(|e| {
2243                                AgentError::InvalidSpec(format!(
2244                                    "failed to build GPU cgroup rule: {e}"
2245                                ))
2246                            })?;
2247                        rules.push(rule);
2248
2249                        // nvidia-uvm (major 510 or check dynamically)
2250                        let uvm_rule = LinuxDeviceCgroupBuilder::default()
2251                            .allow(true)
2252                            .typ(LinuxDeviceType::C)
2253                            .major(510i64)
2254                            .access("rwm".to_string())
2255                            .build()
2256                            .map_err(|e| {
2257                                AgentError::InvalidSpec(format!(
2258                                    "failed to build GPU UVM cgroup rule: {e}"
2259                                ))
2260                            })?;
2261                        rules.push(uvm_rule);
2262                    }
2263                    "amd" => {
2264                        // AMD ROCm: /dev/dri/renderD* and /dev/dri/card* (major 226)
2265                        let dri_rule = LinuxDeviceCgroupBuilder::default()
2266                            .allow(true)
2267                            .typ(LinuxDeviceType::C)
2268                            .major(226i64)
2269                            .access("rwm".to_string())
2270                            .build()
2271                            .map_err(|e| {
2272                                AgentError::InvalidSpec(format!(
2273                                    "failed to build AMD DRI cgroup rule: {e}"
2274                                ))
2275                            })?;
2276                        rules.push(dri_rule);
2277
2278                        // /dev/kfd - AMD Kernel Fusion Driver for compute (major 234)
2279                        let kfd_rule = LinuxDeviceCgroupBuilder::default()
2280                            .allow(true)
2281                            .typ(LinuxDeviceType::C)
2282                            .major(234i64)
2283                            .access("rwm".to_string())
2284                            .build()
2285                            .map_err(|e| {
2286                                AgentError::InvalidSpec(format!(
2287                                    "failed to build AMD KFD cgroup rule: {e}"
2288                                ))
2289                            })?;
2290                        rules.push(kfd_rule);
2291                    }
2292                    "intel" => {
2293                        // Intel GPU: /dev/dri/renderD* and /dev/dri/card* (major 226)
2294                        let dri_rule = LinuxDeviceCgroupBuilder::default()
2295                            .allow(true)
2296                            .typ(LinuxDeviceType::C)
2297                            .major(226i64)
2298                            .access("rwm".to_string())
2299                            .build()
2300                            .map_err(|e| {
2301                                AgentError::InvalidSpec(format!(
2302                                    "failed to build Intel DRI cgroup rule: {e}"
2303                                ))
2304                            })?;
2305                        rules.push(dri_rule);
2306                    }
2307                    other => {
2308                        // Unknown vendor - allow DRI devices as a reasonable default
2309                        tracing::warn!(
2310                            vendor = %other,
2311                            "Unknown GPU vendor, allowing DRI devices (major 226)"
2312                        );
2313                        let dri_rule = LinuxDeviceCgroupBuilder::default()
2314                            .allow(true)
2315                            .typ(LinuxDeviceType::C)
2316                            .major(226i64)
2317                            .access("rwm".to_string())
2318                            .build()
2319                            .map_err(|e| {
2320                                AgentError::InvalidSpec(format!(
2321                                    "failed to build GPU DRI cgroup rule: {e}"
2322                                ))
2323                            })?;
2324                        rules.push(dri_rule);
2325                    }
2326                }
2327            }
2328        }
2329
2330        Ok(rules)
2331    }
2332
2333    /// Build Linux device entries for passthrough
2334    ///
2335    /// # Platform
2336    /// Every branch below walks `/dev/*` on the host to resolve major/minor
2337    /// numbers via `MetadataExt::rdev`. On Windows (where this module is
2338    /// compiled only to feed the WSL2 delegate's cross-platform spec path) we
2339    /// skip device discovery and return an empty list — the Linux side of the
2340    /// delegate re-runs this step inside the WSL2 distro.
2341    #[allow(clippy::unused_self, clippy::too_many_lines)]
2342    #[cfg_attr(not(unix), allow(clippy::unnecessary_wraps, clippy::needless_return))]
2343    fn build_devices(
2344        &self,
2345        spec: &ServiceSpec,
2346        gpu_indices: Option<&[u32]>,
2347        skip_gpu_defaults: bool,
2348    ) -> Result<Vec<oci_spec::runtime::LinuxDevice>> {
2349        #[cfg(not(unix))]
2350        {
2351            let _ = (spec, gpu_indices, skip_gpu_defaults);
2352            return Ok(Vec::new());
2353        }
2354
2355        #[cfg(unix)]
2356        {
2357            let mut devices = Vec::new();
2358
2359            for device in &spec.devices {
2360                if let Ok((major, minor)) = get_device_major_minor(&device.path) {
2361                    let dev_type = get_device_type(&device.path).unwrap_or(LinuxDeviceType::C);
2362
2363                    let linux_device = LinuxDeviceBuilder::default()
2364                        .path(device.path.clone())
2365                        .typ(dev_type)
2366                        .major(major)
2367                        .minor(minor)
2368                        .file_mode(0o666u32)
2369                        .uid(0u32)
2370                        .gid(0u32)
2371                        .build()
2372                        .map_err(|e| {
2373                            AgentError::InvalidSpec(format!(
2374                                "failed to build device {}: {}",
2375                                device.path, e
2376                            ))
2377                        })?;
2378
2379                    devices.push(linux_device);
2380                }
2381            }
2382
2383            // When CDI is providing GPU device descriptors the caller will
2384            // append the vendor-supplied entries; skip our hard-coded
2385            // `/dev/nvidiaN` enumeration so we don't end up with both sources
2386            // of truth.
2387            if skip_gpu_defaults {
2388                return Ok(devices);
2389            }
2390
2391            // Auto-inject GPU devices when gpu spec is set
2392            if let Some(ref gpu) = spec.resources.gpu {
2393                let indices: Vec<u32> =
2394                    gpu_indices.map_or_else(|| (0..gpu.count).collect(), <[u32]>::to_vec);
2395
2396                match gpu.vendor.as_str() {
2397                    "nvidia" => {
2398                        // Always needed: nvidiactl, nvidia-uvm, nvidia-uvm-tools
2399                        let always_devices =
2400                            ["/dev/nvidiactl", "/dev/nvidia-uvm", "/dev/nvidia-uvm-tools"];
2401                        for dev_path in &always_devices {
2402                            if let Ok((major, minor)) = get_device_major_minor(dev_path) {
2403                                let dev_type =
2404                                    get_device_type(dev_path).unwrap_or(LinuxDeviceType::C);
2405                                let linux_device = LinuxDeviceBuilder::default()
2406                                    .path((*dev_path).to_string())
2407                                    .typ(dev_type)
2408                                    .major(major)
2409                                    .minor(minor)
2410                                    .file_mode(0o666u32)
2411                                    .uid(0u32)
2412                                    .gid(0u32)
2413                                    .build()
2414                                    .map_err(|e| {
2415                                        AgentError::InvalidSpec(format!(
2416                                            "failed to build GPU device {dev_path}: {e}"
2417                                        ))
2418                                    })?;
2419                                devices.push(linux_device);
2420                            } else {
2421                                tracing::warn!(
2422                                    "GPU device {} not found on host, skipping",
2423                                    dev_path
2424                                );
2425                            }
2426                        }
2427
2428                        // Per-GPU devices: /dev/nvidia0, /dev/nvidia1, etc.
2429                        for i in &indices {
2430                            let dev_path = format!("/dev/nvidia{i}");
2431                            if let Ok((major, minor)) = get_device_major_minor(&dev_path) {
2432                                let dev_type =
2433                                    get_device_type(&dev_path).unwrap_or(LinuxDeviceType::C);
2434                                let linux_device = LinuxDeviceBuilder::default()
2435                                    .path(dev_path.clone())
2436                                    .typ(dev_type)
2437                                    .major(major)
2438                                    .minor(minor)
2439                                    .file_mode(0o666u32)
2440                                    .uid(0u32)
2441                                    .gid(0u32)
2442                                    .build()
2443                                    .map_err(|e| {
2444                                        AgentError::InvalidSpec(format!(
2445                                            "failed to build GPU device {dev_path}: {e}"
2446                                        ))
2447                                    })?;
2448                                devices.push(linux_device);
2449                            } else {
2450                                tracing::warn!(
2451                                    "GPU device {} not found on host, skipping",
2452                                    dev_path
2453                                );
2454                            }
2455                        }
2456                    }
2457                    "amd" => {
2458                        // AMD ROCm: /dev/kfd is always required for compute
2459                        let amd_always_devices = ["/dev/kfd"];
2460                        for dev_path in &amd_always_devices {
2461                            if let Ok((major, minor)) = get_device_major_minor(dev_path) {
2462                                let dev_type =
2463                                    get_device_type(dev_path).unwrap_or(LinuxDeviceType::C);
2464                                let linux_device = LinuxDeviceBuilder::default()
2465                                    .path((*dev_path).to_string())
2466                                    .typ(dev_type)
2467                                    .major(major)
2468                                    .minor(minor)
2469                                    .file_mode(0o666u32)
2470                                    .uid(0u32)
2471                                    .gid(0u32)
2472                                    .build()
2473                                    .map_err(|e| {
2474                                        AgentError::InvalidSpec(format!(
2475                                            "failed to build GPU device {dev_path}: {e}"
2476                                        ))
2477                                    })?;
2478                                devices.push(linux_device);
2479                            } else {
2480                                tracing::warn!(
2481                                    "GPU device {} not found on host, skipping",
2482                                    dev_path
2483                                );
2484                            }
2485                        }
2486
2487                        // DRI render nodes: /dev/dri/renderD128, renderD129, etc.
2488                        for i in &indices {
2489                            let dev_path = format!("/dev/dri/renderD{}", 128 + i);
2490                            if let Ok((major, minor)) = get_device_major_minor(&dev_path) {
2491                                let dev_type =
2492                                    get_device_type(&dev_path).unwrap_or(LinuxDeviceType::C);
2493                                let linux_device = LinuxDeviceBuilder::default()
2494                                    .path(dev_path.clone())
2495                                    .typ(dev_type)
2496                                    .major(major)
2497                                    .minor(minor)
2498                                    .file_mode(0o666u32)
2499                                    .uid(0u32)
2500                                    .gid(0u32)
2501                                    .build()
2502                                    .map_err(|e| {
2503                                        AgentError::InvalidSpec(format!(
2504                                            "failed to build GPU device {dev_path}: {e}"
2505                                        ))
2506                                    })?;
2507                                devices.push(linux_device);
2508                            } else {
2509                                tracing::warn!(
2510                                    "GPU device {} not found on host, skipping",
2511                                    dev_path
2512                                );
2513                            }
2514                        }
2515
2516                        // DRI card nodes: /dev/dri/card0, card1, etc.
2517                        for i in &indices {
2518                            let dev_path = format!("/dev/dri/card{i}");
2519                            if let Ok((major, minor)) = get_device_major_minor(&dev_path) {
2520                                let dev_type =
2521                                    get_device_type(&dev_path).unwrap_or(LinuxDeviceType::C);
2522                                let linux_device = LinuxDeviceBuilder::default()
2523                                    .path(dev_path.clone())
2524                                    .typ(dev_type)
2525                                    .major(major)
2526                                    .minor(minor)
2527                                    .file_mode(0o666u32)
2528                                    .uid(0u32)
2529                                    .gid(0u32)
2530                                    .build()
2531                                    .map_err(|e| {
2532                                        AgentError::InvalidSpec(format!(
2533                                            "failed to build GPU device {dev_path}: {e}"
2534                                        ))
2535                                    })?;
2536                                devices.push(linux_device);
2537                            } else {
2538                                tracing::warn!(
2539                                    "GPU device {} not found on host, skipping",
2540                                    dev_path
2541                                );
2542                            }
2543                        }
2544                    }
2545                    "intel" => {
2546                        // Intel GPU: DRI render nodes /dev/dri/renderD128, etc.
2547                        for i in &indices {
2548                            let dev_path = format!("/dev/dri/renderD{}", 128 + i);
2549                            if let Ok((major, minor)) = get_device_major_minor(&dev_path) {
2550                                let dev_type =
2551                                    get_device_type(&dev_path).unwrap_or(LinuxDeviceType::C);
2552                                let linux_device = LinuxDeviceBuilder::default()
2553                                    .path(dev_path.clone())
2554                                    .typ(dev_type)
2555                                    .major(major)
2556                                    .minor(minor)
2557                                    .file_mode(0o666u32)
2558                                    .uid(0u32)
2559                                    .gid(0u32)
2560                                    .build()
2561                                    .map_err(|e| {
2562                                        AgentError::InvalidSpec(format!(
2563                                            "failed to build GPU device {dev_path}: {e}"
2564                                        ))
2565                                    })?;
2566                                devices.push(linux_device);
2567                            } else {
2568                                tracing::warn!(
2569                                    "GPU device {} not found on host, skipping",
2570                                    dev_path
2571                                );
2572                            }
2573                        }
2574
2575                        // Intel DRI card nodes: /dev/dri/card0, card1, etc.
2576                        for i in &indices {
2577                            let dev_path = format!("/dev/dri/card{i}");
2578                            if let Ok((major, minor)) = get_device_major_minor(&dev_path) {
2579                                let dev_type =
2580                                    get_device_type(&dev_path).unwrap_or(LinuxDeviceType::C);
2581                                let linux_device = LinuxDeviceBuilder::default()
2582                                    .path(dev_path.clone())
2583                                    .typ(dev_type)
2584                                    .major(major)
2585                                    .minor(minor)
2586                                    .file_mode(0o666u32)
2587                                    .uid(0u32)
2588                                    .gid(0u32)
2589                                    .build()
2590                                    .map_err(|e| {
2591                                        AgentError::InvalidSpec(format!(
2592                                            "failed to build GPU device {dev_path}: {e}"
2593                                        ))
2594                                    })?;
2595                                devices.push(linux_device);
2596                            } else {
2597                                tracing::warn!(
2598                                    "GPU device {} not found on host, skipping",
2599                                    dev_path
2600                                );
2601                            }
2602                        }
2603                    }
2604                    other => {
2605                        // Unknown vendor - try DRI render nodes as default
2606                        tracing::warn!(
2607                            vendor = %other,
2608                            "Unknown GPU vendor, attempting DRI device passthrough"
2609                        );
2610                        for i in &indices {
2611                            let dev_path = format!("/dev/dri/renderD{}", 128 + i);
2612                            if let Ok((major, minor)) = get_device_major_minor(&dev_path) {
2613                                let dev_type =
2614                                    get_device_type(&dev_path).unwrap_or(LinuxDeviceType::C);
2615                                let linux_device = LinuxDeviceBuilder::default()
2616                                    .path(dev_path.clone())
2617                                    .typ(dev_type)
2618                                    .major(major)
2619                                    .minor(minor)
2620                                    .file_mode(0o666u32)
2621                                    .uid(0u32)
2622                                    .gid(0u32)
2623                                    .build()
2624                                    .map_err(|e| {
2625                                        AgentError::InvalidSpec(format!(
2626                                            "failed to build GPU device {dev_path}: {e}"
2627                                        ))
2628                                    })?;
2629                                devices.push(linux_device);
2630                            } else {
2631                                tracing::warn!(
2632                                    "GPU device {} not found on host, skipping",
2633                                    dev_path
2634                                );
2635                            }
2636                        }
2637                    }
2638                }
2639            }
2640
2641            Ok(devices)
2642        } // end #[cfg(unix)]
2643    }
2644
2645    /// Generate the OCI spec and write config.json to the bundle directory
2646    ///
2647    /// Unlike `build()`, this does NOT create the bundle directory or set up rootfs.
2648    /// Use this when the bundle directory and rootfs already exist (e.g., rootfs was
2649    /// extracted directly by `LayerUnpacker`).
2650    ///
2651    /// # Errors
2652    /// Returns an error if the OCI spec cannot be built or config.json cannot be written.
2653    ///
2654    /// # Returns
2655    /// The path to the bundle directory on success
2656    pub async fn write_config(
2657        &self,
2658        container_id: &ContainerId,
2659        spec: &ServiceSpec,
2660    ) -> Result<PathBuf> {
2661        // Generate OCI runtime spec
2662        let oci_spec = self
2663            .build_spec_only(container_id, spec, &self.volume_paths)
2664            .await?;
2665
2666        // Write config.json
2667        let config_path = self.bundle_dir.join("config.json");
2668        let config_json =
2669            serde_json::to_string_pretty(&oci_spec).map_err(|e| AgentError::CreateFailed {
2670                id: container_id.to_string(),
2671                reason: format!("failed to serialize OCI spec: {e}"),
2672            })?;
2673
2674        fs::write(&config_path, config_json)
2675            .await
2676            .map_err(|e| AgentError::CreateFailed {
2677                id: container_id.to_string(),
2678                reason: format!("failed to write config.json: {e}"),
2679            })?;
2680
2681        tracing::debug!(
2682            "Wrote OCI config.json at {} for container {}",
2683            config_path.display(),
2684            container_id
2685        );
2686
2687        Ok(self.bundle_dir.clone())
2688    }
2689
2690    /// Resolve command from `ServiceSpec` and optional image config following Docker/OCI semantics
2691    ///
2692    /// Resolution order:
2693    /// 1. spec entrypoint + args -> use those
2694    /// 2. spec entrypoint only -> use entrypoint
2695    /// 3. spec args only -> use args
2696    /// 4. `image_config` entrypoint/cmd -> use `image_config.full_command()`
2697    /// 5. fallback to /bin/sh
2698    fn resolve_command_from_spec(
2699        spec: &ServiceSpec,
2700        image_config: Option<&zlayer_registry::ImageConfig>,
2701    ) -> Vec<String> {
2702        let mut args = Vec::new();
2703
2704        match (&spec.command.entrypoint, &spec.command.args) {
2705            (Some(entrypoint), Some(cmd_args)) => {
2706                args.extend_from_slice(entrypoint);
2707                args.extend_from_slice(cmd_args);
2708            }
2709            (Some(entrypoint), None) => {
2710                args.extend_from_slice(entrypoint);
2711            }
2712            (None, Some(cmd_args)) if !cmd_args.is_empty() => {
2713                args.extend_from_slice(cmd_args);
2714            }
2715            _ => {
2716                // No spec command - try image config
2717                if let Some(img_cmd) =
2718                    image_config.and_then(zlayer_registry::ImageConfig::full_command)
2719                {
2720                    if img_cmd.is_empty() {
2721                        args.push("/bin/sh".to_string());
2722                    } else {
2723                        args.extend(img_cmd);
2724                    }
2725                } else {
2726                    args.push("/bin/sh".to_string());
2727                }
2728            }
2729        }
2730
2731        args
2732    }
2733
2734    /// Clean up a bundle directory
2735    ///
2736    /// Removes the bundle directory and all its contents.
2737    ///
2738    /// # Errors
2739    /// Returns an error if the bundle directory cannot be removed.
2740    pub async fn cleanup(&self) -> Result<()> {
2741        if self.bundle_dir.exists() {
2742            fs::remove_dir_all(&self.bundle_dir)
2743                .await
2744                .map_err(|e| AgentError::CreateFailed {
2745                    id: "cleanup".to_string(),
2746                    reason: format!(
2747                        "failed to remove bundle directory {}: {}",
2748                        self.bundle_dir.display(),
2749                        e
2750                    ),
2751                })?;
2752        }
2753        Ok(())
2754    }
2755}
2756
2757/// Create a bundle for a container
2758///
2759/// Convenience function that creates a bundle in the default location.
2760///
2761/// # Errors
2762/// Returns an error if bundle creation fails.
2763///
2764/// # Platform
2765/// Unix-only — wraps [`BundleBuilder::build`], which uses
2766/// `tokio::fs::symlink` (not available on Windows). Windows callers should
2767/// use [`BundleBuilder::build_spec_only`] directly and pipe the result into
2768/// a WSL2 delegate.
2769#[cfg(unix)]
2770pub async fn create_bundle(
2771    container_id: &ContainerId,
2772    spec: &ServiceSpec,
2773    rootfs_path: Option<PathBuf>,
2774) -> Result<PathBuf> {
2775    let mut builder =
2776        BundleBuilder::for_container(container_id).with_host_network(spec.host_network);
2777
2778    if let Some(rootfs) = rootfs_path {
2779        builder = builder.with_rootfs(rootfs);
2780    }
2781
2782    builder.build(container_id, spec).await
2783}
2784
2785/// Clean up a container's bundle
2786///
2787/// Convenience function to remove a bundle from the default location.
2788///
2789/// # Errors
2790/// Returns an error if cleanup fails.
2791pub async fn cleanup_bundle(container_id: &ContainerId) -> Result<()> {
2792    let builder = BundleBuilder::for_container(container_id);
2793    builder.cleanup().await
2794}
2795
2796#[cfg(test)]
2797mod tests {
2798    use super::*;
2799    use zlayer_spec::*;
2800
2801    fn mock_spec() -> ServiceSpec {
2802        serde_yaml::from_str::<DeploymentSpec>(
2803            r"
2804version: v1
2805deployment: test
2806services:
2807  test:
2808    rtype: service
2809    image:
2810      name: test:latest
2811    endpoints:
2812      - name: http
2813        protocol: http
2814        port: 8080
2815",
2816        )
2817        .unwrap()
2818        .services
2819        .remove("test")
2820        .unwrap()
2821    }
2822
2823    #[cfg(target_os = "linux")]
2824    fn mock_spec_with_resources() -> ServiceSpec {
2825        serde_yaml::from_str::<DeploymentSpec>(
2826            r"
2827version: v1
2828deployment: test
2829services:
2830  test:
2831    rtype: service
2832    image:
2833      name: test:latest
2834    resources:
2835      cpu: 0.5
2836      memory: 512Mi
2837    env:
2838      MY_VAR: my_value
2839      ANOTHER: value2
2840    endpoints:
2841      - name: http
2842        protocol: http
2843        port: 8080
2844",
2845        )
2846        .unwrap()
2847        .services
2848        .remove("test")
2849        .unwrap()
2850    }
2851
2852    #[cfg(target_os = "linux")]
2853    fn mock_privileged_spec() -> ServiceSpec {
2854        serde_yaml::from_str::<DeploymentSpec>(
2855            r"
2856version: v1
2857deployment: test
2858services:
2859  test:
2860    rtype: service
2861    image:
2862      name: test:latest
2863    privileged: true
2864    endpoints:
2865      - name: http
2866        protocol: http
2867        port: 8080
2868",
2869        )
2870        .unwrap()
2871        .services
2872        .remove("test")
2873        .unwrap()
2874    }
2875
2876    #[test]
2877    fn test_parse_memory_string() {
2878        assert_eq!(parse_memory_string("512Mi").unwrap(), 512 * 1024 * 1024);
2879        assert_eq!(parse_memory_string("1Gi").unwrap(), 1024 * 1024 * 1024);
2880        assert_eq!(parse_memory_string("2G").unwrap(), 2 * 1000 * 1000 * 1000);
2881        assert_eq!(parse_memory_string("1024").unwrap(), 1024);
2882        assert_eq!(parse_memory_string("512Ki").unwrap(), 512 * 1024);
2883    }
2884
2885    #[test]
2886    fn test_parse_memory_string_errors() {
2887        assert!(parse_memory_string("").is_err());
2888        assert!(parse_memory_string("abc").is_err());
2889        assert!(parse_memory_string("12.5Mi").is_err());
2890    }
2891
2892    #[test]
2893    fn test_generate_resolv_conf_single_nameserver() {
2894        let out = generate_resolv_conf(&["10.42.0.1".to_string()]);
2895        assert_eq!(out, "nameserver 10.42.0.1\noptions edns0\n");
2896    }
2897
2898    #[test]
2899    fn test_generate_resolv_conf_two_nameservers() {
2900        let out = generate_resolv_conf(&["10.42.0.1".to_string(), "fd00::1".to_string()]);
2901        assert_eq!(
2902            out,
2903            "nameserver 10.42.0.1\nnameserver fd00::1\noptions edns0\n"
2904        );
2905    }
2906
2907    #[cfg(target_os = "linux")]
2908    #[tokio::test]
2909    async fn test_build_oci_spec_injects_resolv_conf_mount() {
2910        let dir = tempfile::tempdir().unwrap();
2911        let id = ContainerId::new("test".to_string(), 1);
2912        let mut spec = mock_spec();
2913        spec.dns = vec!["10.42.0.1".to_string()];
2914        let builder = BundleBuilder::new(dir.path().to_path_buf());
2915
2916        let oci_spec = builder
2917            .build_spec_only(&id, &spec, &std::collections::HashMap::new())
2918            .await
2919            .unwrap();
2920
2921        let mounts = oci_spec.mounts().as_ref().expect("mounts present");
2922        let resolv_mount = mounts
2923            .iter()
2924            .find(|m| m.destination() == Path::new("/etc/resolv.conf"))
2925            .expect("resolv.conf mount injected");
2926        let source = resolv_mount.source().as_ref().unwrap();
2927        let written = std::fs::read_to_string(source).unwrap();
2928        assert_eq!(written, "nameserver 10.42.0.1\noptions edns0\n");
2929    }
2930
2931    #[cfg(target_os = "linux")]
2932    #[tokio::test]
2933    async fn test_build_oci_spec_no_resolv_conf_when_dns_empty() {
2934        let dir = tempfile::tempdir().unwrap();
2935        let id = ContainerId::new("test".to_string(), 1);
2936        let spec = mock_spec(); // spec.dns defaults to empty
2937        let builder = BundleBuilder::new(dir.path().to_path_buf());
2938
2939        let oci_spec = builder
2940            .build_spec_only(&id, &spec, &std::collections::HashMap::new())
2941            .await
2942            .unwrap();
2943
2944        let mounts = oci_spec.mounts().as_ref().expect("mounts present");
2945        assert!(
2946            !mounts
2947                .iter()
2948                .any(|m| m.destination() == Path::new("/etc/resolv.conf")),
2949            "no resolv.conf mount should be injected for empty spec.dns"
2950        );
2951    }
2952
2953    #[cfg(target_os = "linux")]
2954    #[tokio::test]
2955    async fn test_build_oci_spec_no_resolv_conf_when_host_network() {
2956        let dir = tempfile::tempdir().unwrap();
2957        let id = ContainerId::new("test".to_string(), 1);
2958        let mut spec = mock_spec();
2959        spec.dns = vec!["10.42.0.1".to_string()];
2960        spec.host_network = true;
2961        let builder = BundleBuilder::new(dir.path().to_path_buf());
2962
2963        let oci_spec = builder
2964            .build_spec_only(&id, &spec, &std::collections::HashMap::new())
2965            .await
2966            .unwrap();
2967
2968        let mounts = oci_spec.mounts().as_ref().expect("mounts present");
2969        assert!(
2970            !mounts
2971                .iter()
2972                .any(|m| m.destination() == Path::new("/etc/resolv.conf")),
2973            "host_network containers must inherit the host resolv.conf"
2974        );
2975    }
2976
2977    #[test]
2978    fn test_bundle_builder_new() {
2979        let builder = BundleBuilder::new("/tmp/test-bundle".into());
2980        assert_eq!(builder.bundle_dir(), Path::new("/tmp/test-bundle"));
2981        assert!(builder.rootfs_path.is_none());
2982    }
2983
2984    #[test]
2985    fn test_bundle_builder_for_container() {
2986        let dirs = zlayer_paths::ZLayerDirs::system_default();
2987        let id = ContainerId::new("myservice".to_string(), 1);
2988        let builder = BundleBuilder::for_container(&id);
2989        assert_eq!(builder.bundle_dir(), dirs.bundles().join("myservice-rep-1"));
2990    }
2991
2992    #[test]
2993    fn test_bundle_builder_with_rootfs() {
2994        let dirs = zlayer_paths::ZLayerDirs::system_default();
2995        let builder = BundleBuilder::new("/tmp/test-bundle".into())
2996            .with_rootfs(dirs.rootfs().join("myimage"));
2997        assert_eq!(builder.rootfs_path, Some(dirs.rootfs().join("myimage")));
2998    }
2999
3000    #[cfg(target_os = "linux")]
3001    #[tokio::test]
3002    async fn test_build_oci_spec_basic() {
3003        let id = ContainerId::new("test".to_string(), 1);
3004        let spec = mock_spec();
3005        let builder = BundleBuilder::new("/tmp/test-bundle".into());
3006
3007        let oci_spec = builder
3008            .build_spec_only(&id, &spec, &std::collections::HashMap::new())
3009            .await
3010            .unwrap();
3011
3012        assert_eq!(oci_spec.version(), "1.0.2");
3013        assert!(oci_spec.root().is_some());
3014        assert_eq!(
3015            oci_spec.root().as_ref().unwrap().path(),
3016            std::path::Path::new("rootfs")
3017        );
3018        assert!(oci_spec.process().is_some());
3019        assert!(oci_spec.linux().is_some());
3020    }
3021
3022    #[cfg(target_os = "linux")]
3023    #[tokio::test]
3024    async fn test_build_oci_spec_with_resources() {
3025        let id = ContainerId::new("test".to_string(), 1);
3026        let spec = mock_spec_with_resources();
3027        let builder = BundleBuilder::new("/tmp/test-bundle".into());
3028
3029        let oci_spec = builder
3030            .build_spec_only(&id, &spec, &std::collections::HashMap::new())
3031            .await
3032            .unwrap();
3033
3034        // Check that resources are set
3035        let linux = oci_spec.linux().as_ref().unwrap();
3036        let resources = linux.resources().as_ref().unwrap();
3037
3038        // Check CPU
3039        let cpu = resources.cpu().as_ref().unwrap();
3040        assert_eq!(cpu.quota(), Some(50_000)); // 0.5 cores * 100000
3041        assert_eq!(cpu.period(), Some(100_000));
3042
3043        // Check memory
3044        let memory = resources.memory().as_ref().unwrap();
3045        assert_eq!(memory.limit(), Some(512 * 1024 * 1024)); // 512Mi
3046    }
3047
3048    #[cfg(target_os = "linux")]
3049    #[tokio::test]
3050    async fn test_build_oci_spec_privileged() {
3051        let id = ContainerId::new("test".to_string(), 1);
3052        let spec = mock_privileged_spec();
3053        let builder = BundleBuilder::new("/tmp/test-bundle".into());
3054
3055        let oci_spec = builder
3056            .build_spec_only(&id, &spec, &std::collections::HashMap::new())
3057            .await
3058            .unwrap();
3059
3060        // Check that all capabilities are set
3061        let process = oci_spec.process().as_ref().unwrap();
3062        let caps = process.capabilities().as_ref().unwrap();
3063        let bounding = caps.bounding().as_ref().unwrap();
3064
3065        // Should have all capabilities
3066        assert!(bounding.contains(&Capability::SysAdmin));
3067        assert!(bounding.contains(&Capability::NetAdmin));
3068
3069        // Check that masked paths are NOT set for privileged
3070        let linux = oci_spec.linux().as_ref().unwrap();
3071        assert!(
3072            linux.masked_paths().is_none() || linux.masked_paths().as_ref().unwrap().is_empty()
3073        );
3074    }
3075
3076    #[cfg(target_os = "linux")]
3077    #[tokio::test]
3078    async fn test_build_oci_spec_environment() {
3079        let id = ContainerId::new("test".to_string(), 1);
3080        let spec = mock_spec_with_resources();
3081        let builder = BundleBuilder::new("/tmp/test-bundle".into())
3082            .with_env("EXTRA_VAR".to_string(), "extra_value".to_string());
3083
3084        let oci_spec = builder
3085            .build_spec_only(&id, &spec, &std::collections::HashMap::new())
3086            .await
3087            .unwrap();
3088
3089        let process = oci_spec.process().as_ref().unwrap();
3090        let env = process.env().as_ref().unwrap();
3091
3092        // Check service env vars are present
3093        assert!(env.iter().any(|e| e == "MY_VAR=my_value"));
3094        assert!(env.iter().any(|e| e == "ANOTHER=value2"));
3095        // Check extra env var is present
3096        assert!(env.iter().any(|e| e == "EXTRA_VAR=extra_value"));
3097        // Check PATH is present
3098        assert!(env.iter().any(|e| e.starts_with("PATH=")));
3099    }
3100
3101    #[cfg(target_os = "linux")]
3102    #[tokio::test]
3103    async fn test_build_namespaces() {
3104        let id = ContainerId::new("test".to_string(), 1);
3105        let spec = mock_spec();
3106        let builder = BundleBuilder::new("/tmp/test-bundle".into());
3107
3108        let oci_spec = builder
3109            .build_spec_only(&id, &spec, &std::collections::HashMap::new())
3110            .await
3111            .unwrap();
3112        let linux = oci_spec.linux().as_ref().unwrap();
3113        let namespaces = linux.namespaces().as_ref().unwrap();
3114
3115        // Check we have the expected namespaces
3116        let namespace_types: Vec<_> = namespaces
3117            .iter()
3118            .map(oci_spec::runtime::LinuxNamespace::typ)
3119            .collect();
3120        assert!(namespace_types.contains(&LinuxNamespaceType::Pid));
3121        assert!(namespace_types.contains(&LinuxNamespaceType::Ipc));
3122        assert!(namespace_types.contains(&LinuxNamespaceType::Uts));
3123        assert!(namespace_types.contains(&LinuxNamespaceType::Mount));
3124        assert!(namespace_types.contains(&LinuxNamespaceType::Network));
3125    }
3126
3127    #[cfg(target_os = "linux")]
3128    #[tokio::test]
3129    async fn test_build_namespaces_host_network() {
3130        let id = ContainerId::new("test".to_string(), 1);
3131        let spec = mock_spec();
3132        let builder = BundleBuilder::new("/tmp/test-bundle".into()).with_host_network(true);
3133
3134        let oci_spec = builder
3135            .build_spec_only(&id, &spec, &std::collections::HashMap::new())
3136            .await
3137            .unwrap();
3138        let linux = oci_spec.linux().as_ref().unwrap();
3139        let namespaces = linux.namespaces().as_ref().unwrap();
3140
3141        // Check we have the expected namespaces (NO Network namespace)
3142        let namespace_types: Vec<_> = namespaces
3143            .iter()
3144            .map(oci_spec::runtime::LinuxNamespace::typ)
3145            .collect();
3146        assert!(namespace_types.contains(&LinuxNamespaceType::Pid));
3147        assert!(namespace_types.contains(&LinuxNamespaceType::Ipc));
3148        assert!(namespace_types.contains(&LinuxNamespaceType::Uts));
3149        assert!(namespace_types.contains(&LinuxNamespaceType::Mount));
3150        assert!(
3151            !namespace_types.contains(&LinuxNamespaceType::Network),
3152            "Network namespace should NOT be present in host_network mode"
3153        );
3154    }
3155
3156    #[test]
3157    fn test_build_default_mounts() {
3158        let spec = mock_spec();
3159        let builder = BundleBuilder::new("/tmp/test-bundle".into());
3160
3161        let mounts = builder.build_default_mounts(&spec).unwrap();
3162
3163        // Check we have the expected mounts
3164        let mount_destinations: Vec<_> = mounts
3165            .iter()
3166            .map(|m| m.destination().to_string_lossy().to_string())
3167            .collect();
3168        assert!(mount_destinations.contains(&"/proc".to_string()));
3169        assert!(mount_destinations.contains(&"/dev".to_string()));
3170        assert!(mount_destinations.contains(&"/dev/pts".to_string()));
3171        assert!(mount_destinations.contains(&"/dev/shm".to_string()));
3172        assert!(mount_destinations.contains(&"/sys".to_string()));
3173    }
3174
3175    #[test]
3176    fn test_build_storage_mounts_bind() {
3177        let spec = serde_yaml::from_str::<zlayer_spec::DeploymentSpec>(
3178            r"
3179version: v1
3180deployment: test
3181services:
3182  test:
3183    image:
3184      name: test:latest
3185    storage:
3186      - type: bind
3187        source: /host/data
3188        target: /app/data
3189        readonly: true
3190",
3191        )
3192        .unwrap()
3193        .services
3194        .remove("test")
3195        .unwrap();
3196
3197        let builder = BundleBuilder::new("/tmp/test-bundle".into());
3198        let volume_paths = std::collections::HashMap::new();
3199
3200        let mounts = builder.build_storage_mounts(&spec, &volume_paths).unwrap();
3201
3202        assert_eq!(mounts.len(), 1);
3203        assert_eq!(mounts[0].destination().to_string_lossy(), "/app/data");
3204        assert_eq!(
3205            mounts[0]
3206                .source()
3207                .as_ref()
3208                .map(|s| s.to_string_lossy().to_string()),
3209            Some("/host/data".to_string())
3210        );
3211        let options = mounts[0].options().as_ref().unwrap();
3212        assert!(options.contains(&"rbind".to_string()));
3213        assert!(options.contains(&"ro".to_string()));
3214    }
3215
3216    #[test]
3217    fn test_build_storage_mounts_named() {
3218        let spec = serde_yaml::from_str::<zlayer_spec::DeploymentSpec>(
3219            r"
3220version: v1
3221deployment: test
3222services:
3223  test:
3224    image:
3225      name: test:latest
3226    storage:
3227      - type: named
3228        name: my-volume
3229        target: /app/data
3230",
3231        )
3232        .unwrap()
3233        .services
3234        .remove("test")
3235        .unwrap();
3236
3237        let dirs = zlayer_paths::ZLayerDirs::system_default();
3238        let builder = BundleBuilder::new("/tmp/test-bundle".into());
3239        let mut volume_paths = std::collections::HashMap::new();
3240        volume_paths.insert("my-volume".to_string(), dirs.volumes().join("my-volume"));
3241
3242        let mounts = builder.build_storage_mounts(&spec, &volume_paths).unwrap();
3243
3244        assert_eq!(mounts.len(), 1);
3245        assert_eq!(mounts[0].destination().to_string_lossy(), "/app/data");
3246        assert_eq!(
3247            mounts[0]
3248                .source()
3249                .as_ref()
3250                .map(|s| s.to_string_lossy().to_string()),
3251            Some(
3252                dirs.volumes()
3253                    .join("my-volume")
3254                    .to_string_lossy()
3255                    .into_owned()
3256            )
3257        );
3258    }
3259
3260    #[test]
3261    fn test_build_storage_mounts_tmpfs() {
3262        let spec = serde_yaml::from_str::<zlayer_spec::DeploymentSpec>(
3263            r"
3264version: v1
3265deployment: test
3266services:
3267  test:
3268    image:
3269      name: test:latest
3270    storage:
3271      - type: tmpfs
3272        target: /app/tmp
3273        size: 256Mi
3274        mode: 1777
3275",
3276        )
3277        .unwrap()
3278        .services
3279        .remove("test")
3280        .unwrap();
3281
3282        let builder = BundleBuilder::new("/tmp/test-bundle".into());
3283        let volume_paths = std::collections::HashMap::new();
3284
3285        let mounts = builder.build_storage_mounts(&spec, &volume_paths).unwrap();
3286
3287        assert_eq!(mounts.len(), 1);
3288        assert_eq!(mounts[0].destination().to_string_lossy(), "/app/tmp");
3289        assert_eq!(mounts[0].typ().as_ref().map(String::as_str), Some("tmpfs"));
3290        let options = mounts[0].options().as_ref().unwrap();
3291        assert!(options.iter().any(|o| o.starts_with("size=")));
3292        assert!(options.iter().any(|o| o.starts_with("mode=")));
3293    }
3294
3295    #[test]
3296    fn test_build_storage_mounts_multiple() {
3297        let spec = serde_yaml::from_str::<zlayer_spec::DeploymentSpec>(
3298            r"
3299version: v1
3300deployment: test
3301services:
3302  test:
3303    image:
3304      name: test:latest
3305    storage:
3306      - type: bind
3307        source: /etc/config
3308        target: /app/config
3309        readonly: true
3310      - type: named
3311        name: app-data
3312        target: /app/data
3313      - type: tmpfs
3314        target: /app/tmp
3315",
3316        )
3317        .unwrap()
3318        .services
3319        .remove("test")
3320        .unwrap();
3321
3322        let dirs = zlayer_paths::ZLayerDirs::system_default();
3323        let builder = BundleBuilder::new("/tmp/test-bundle".into());
3324        let mut volume_paths = std::collections::HashMap::new();
3325        volume_paths.insert("app-data".to_string(), dirs.volumes().join("app-data"));
3326
3327        let mounts = builder.build_storage_mounts(&spec, &volume_paths).unwrap();
3328
3329        assert_eq!(mounts.len(), 3);
3330
3331        // Verify each mount is correct type
3332        let destinations: Vec<String> = mounts
3333            .iter()
3334            .map(|m| m.destination().to_string_lossy().to_string())
3335            .collect();
3336        assert!(destinations.contains(&"/app/config".to_string()));
3337        assert!(destinations.contains(&"/app/data".to_string()));
3338        assert!(destinations.contains(&"/app/tmp".to_string()));
3339    }
3340
3341    #[test]
3342    fn test_build_storage_mounts_anonymous_missing_path() {
3343        let spec = serde_yaml::from_str::<zlayer_spec::DeploymentSpec>(
3344            r"
3345version: v1
3346deployment: test
3347services:
3348  test:
3349    image:
3350      name: test:latest
3351    storage:
3352      - type: anonymous
3353        target: /app/cache
3354",
3355        )
3356        .unwrap()
3357        .services
3358        .remove("test")
3359        .unwrap();
3360
3361        let builder = BundleBuilder::new("/tmp/test-bundle".into());
3362        let volume_paths = std::collections::HashMap::new(); // No path provided
3363
3364        let result = builder.build_storage_mounts(&spec, &volume_paths);
3365
3366        // Should fail because anonymous volume path not prepared
3367        assert!(result.is_err());
3368    }
3369
3370    #[cfg(target_os = "linux")]
3371    #[tokio::test]
3372    async fn test_oci_spec_includes_storage_mounts() {
3373        let id = ContainerId::new("test".to_string(), 1);
3374        let spec = serde_yaml::from_str::<zlayer_spec::DeploymentSpec>(
3375            r"
3376version: v1
3377deployment: test
3378services:
3379  test:
3380    image:
3381      name: test:latest
3382    storage:
3383      - type: bind
3384        source: /host/data
3385        target: /app/data
3386      - type: tmpfs
3387        target: /app/tmp
3388",
3389        )
3390        .unwrap()
3391        .services
3392        .remove("test")
3393        .unwrap();
3394
3395        let builder = BundleBuilder::new("/tmp/test-bundle".into());
3396        let volume_paths = std::collections::HashMap::new();
3397
3398        let oci_spec = builder
3399            .build_spec_only(&id, &spec, &volume_paths)
3400            .await
3401            .unwrap();
3402
3403        // Verify the OCI spec includes storage mounts
3404        let mounts = oci_spec.mounts().as_ref().unwrap();
3405        let destinations: Vec<String> = mounts
3406            .iter()
3407            .map(|m| m.destination().to_string_lossy().to_string())
3408            .collect();
3409
3410        // Should include both default mounts and storage mounts
3411        assert!(destinations.contains(&"/proc".to_string())); // default
3412        assert!(destinations.contains(&"/dev".to_string())); // default
3413        assert!(destinations.contains(&"/app/data".to_string())); // storage bind
3414        assert!(destinations.contains(&"/app/tmp".to_string())); // storage tmpfs
3415    }
3416
3417    fn mock_gpu_spec(vendor: &str, count: u32) -> ServiceSpec {
3418        let yaml = format!(
3419            "
3420version: v1
3421deployment: test
3422services:
3423  test:
3424    rtype: service
3425    image:
3426      name: test:latest
3427    resources:
3428      gpu:
3429        count: {count}
3430        vendor: {vendor}
3431    endpoints:
3432      - name: http
3433        protocol: http
3434        port: 8080
3435"
3436        );
3437        serde_yaml::from_str::<DeploymentSpec>(&yaml)
3438            .unwrap()
3439            .services
3440            .remove("test")
3441            .unwrap()
3442    }
3443
3444    fn write_nvidia_cdi_fixture(dir: &std::path::Path, json: &str) {
3445        std::fs::write(dir.join("nvidia.json"), json).unwrap();
3446    }
3447
3448    fn nvidia_cdi_fixture() -> &'static str {
3449        r#"{
3450            "cdiVersion": "0.6.0",
3451            "kind": "nvidia.com/gpu",
3452            "devices": [{
3453                "name": "0",
3454                "containerEdits": {
3455                    "deviceNodes": [
3456                        {"path": "/dev/nvidia0", "type": "c", "major": 195, "minor": 0}
3457                    ],
3458                    "env": ["NVIDIA_VISIBLE_DEVICES=0"],
3459                    "hooks": {
3460                        "createContainer": [{
3461                            "path": "/usr/bin/nvidia-container-runtime-hook",
3462                            "args": ["nvidia-container-runtime-hook", "prestart"]
3463                        }]
3464                    }
3465                }
3466            }]
3467        }"#
3468    }
3469
3470    #[cfg(target_os = "linux")]
3471    #[tokio::test]
3472    async fn gpu_spec_translates_to_cdi_device_nodes() {
3473        let dir = tempfile::tempdir().unwrap();
3474        write_nvidia_cdi_fixture(dir.path(), nvidia_cdi_fixture());
3475        let registry = std::sync::Arc::new(crate::cdi::CdiRegistry::discover_from(&[dir.path()]));
3476
3477        let id = ContainerId::new("test".to_string(), 1);
3478        let spec = mock_gpu_spec("nvidia", 1);
3479        let builder = BundleBuilder::new("/tmp/test-bundle-cdi".into()).with_cdi_registry(registry);
3480
3481        let oci_spec = builder
3482            .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3483            .await
3484            .expect("build with CDI fixture");
3485
3486        // CDI device node merged into linux.devices
3487        let linux = oci_spec.linux().as_ref().expect("linux config present");
3488        let devices = linux.devices().as_ref().expect("devices present");
3489        assert!(
3490            devices
3491                .iter()
3492                .any(|d| d.path() == std::path::Path::new("/dev/nvidia0")),
3493            "expected /dev/nvidia0 from CDI fixture; got {:?}",
3494            devices
3495                .iter()
3496                .map(oci_spec::runtime::LinuxDevice::path)
3497                .collect::<Vec<_>>()
3498        );
3499
3500        // CDI env var merged into process.env
3501        let process = oci_spec.process().as_ref().expect("process present");
3502        let env = process.env().as_ref().expect("env present");
3503        assert!(
3504            env.iter().any(|e| e == "NVIDIA_VISIBLE_DEVICES=0"),
3505            "expected NVIDIA_VISIBLE_DEVICES=0 in env; got {env:?}"
3506        );
3507
3508        // CDI hook merged into hooks.createContainer
3509        let hooks = oci_spec.hooks().as_ref().expect("hooks present");
3510        let create_container = hooks
3511            .create_container()
3512            .as_ref()
3513            .expect("createContainer hooks present");
3514        assert_eq!(create_container.len(), 1);
3515        assert_eq!(
3516            create_container[0].path(),
3517            &std::path::PathBuf::from("/usr/bin/nvidia-container-runtime-hook")
3518        );
3519    }
3520
3521    #[tokio::test]
3522    async fn gpu_spec_with_missing_cdi_returns_error() {
3523        // Empty tempdir — no CDI specs installed at all.
3524        let dir = tempfile::tempdir().unwrap();
3525        let registry = std::sync::Arc::new(crate::cdi::CdiRegistry::discover_from(&[dir.path()]));
3526
3527        let id = ContainerId::new("test".to_string(), 1);
3528        let spec = mock_gpu_spec("nvidia", 1);
3529        let builder =
3530            BundleBuilder::new("/tmp/test-bundle-cdi-missing".into()).with_cdi_registry(registry);
3531
3532        let err = builder
3533            .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3534            .await
3535            .expect_err("should fail when CDI registry is empty");
3536
3537        match err {
3538            AgentError::InvalidSpec(msg) => {
3539                assert!(
3540                    msg.contains("nvidia") || msg.contains("CDI"),
3541                    "error should mention CDI / vendor; got: {msg}"
3542                );
3543            }
3544            other => panic!("expected InvalidSpec, got {other:?}"),
3545        }
3546    }
3547
3548    #[tokio::test]
3549    async fn gpu_spec_with_unknown_device_returns_error() {
3550        // Spec has device "0" but the request will ask for two GPUs (so the
3551        // resolver will look for "1" and fail).
3552        let dir = tempfile::tempdir().unwrap();
3553        write_nvidia_cdi_fixture(dir.path(), nvidia_cdi_fixture());
3554        let registry = std::sync::Arc::new(crate::cdi::CdiRegistry::discover_from(&[dir.path()]));
3555
3556        let id = ContainerId::new("test".to_string(), 1);
3557        let spec = mock_gpu_spec("nvidia", 2);
3558        let builder =
3559            BundleBuilder::new("/tmp/test-bundle-cdi-unknown".into()).with_cdi_registry(registry);
3560
3561        let err = builder
3562            .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3563            .await
3564            .expect_err("should fail when device '1' is not declared");
3565        match err {
3566            AgentError::InvalidSpec(msg) => {
3567                assert!(
3568                    msg.contains("'1'") || msg.contains("device"),
3569                    "error should mention the missing device; got: {msg}"
3570                );
3571            }
3572            other => panic!("expected InvalidSpec, got {other:?}"),
3573        }
3574    }
3575
3576    #[cfg(target_os = "linux")]
3577    #[tokio::test]
3578    async fn gpu_spec_with_all_devices_expands_to_all_in_spec() {
3579        // Fixture with two declared devices ("0" and "1").
3580        let dir = tempfile::tempdir().unwrap();
3581        let fixture = r#"{
3582            "cdiVersion": "0.6.0",
3583            "kind": "nvidia.com/gpu",
3584            "devices": [
3585                {
3586                    "name": "0",
3587                    "containerEdits": {
3588                        "env": ["NVIDIA_VISIBLE_DEVICES=0"],
3589                        "deviceNodes": [
3590                            {"path": "/dev/nvidia0", "type": "c", "major": 195, "minor": 0}
3591                        ]
3592                    }
3593                },
3594                {
3595                    "name": "1",
3596                    "containerEdits": {
3597                        "env": ["NVIDIA_VISIBLE_DEVICES=1"],
3598                        "deviceNodes": [
3599                            {"path": "/dev/nvidia1", "type": "c", "major": 195, "minor": 1}
3600                        ]
3601                    }
3602                }
3603            ]
3604        }"#;
3605        write_nvidia_cdi_fixture(dir.path(), fixture);
3606        let registry = std::sync::Arc::new(crate::cdi::CdiRegistry::discover_from(&[dir.path()]));
3607
3608        // Resolve "all" via the registry directly to validate expansion
3609        // semantics independently of how we map count -> names.
3610        let edits = registry
3611            .resolve_for_kind("nvidia.com/gpu", &["all".to_string()])
3612            .expect("resolve all");
3613        assert_eq!(edits.len(), 2);
3614
3615        // Now build the bundle for a 2-GPU service and confirm both nodes
3616        // land in linux.devices.
3617        let id = ContainerId::new("test".to_string(), 1);
3618        let spec = mock_gpu_spec("nvidia", 2);
3619        let builder =
3620            BundleBuilder::new("/tmp/test-bundle-cdi-all".into()).with_cdi_registry(registry);
3621
3622        let oci_spec = builder
3623            .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3624            .await
3625            .expect("build with 2-device fixture");
3626
3627        let devices = oci_spec
3628            .linux()
3629            .as_ref()
3630            .unwrap()
3631            .devices()
3632            .as_ref()
3633            .expect("devices present");
3634        let paths: Vec<_> = devices.iter().map(|d| d.path().clone()).collect();
3635        assert!(paths.contains(&std::path::PathBuf::from("/dev/nvidia0")));
3636        assert!(paths.contains(&std::path::PathBuf::from("/dev/nvidia1")));
3637    }
3638
3639    /// Build the standard fixture-backed CDI registry used by the MPS /
3640    /// time-slicing tests. Identical to the helper used by the 5.A CDI
3641    /// tests above but expressed as a closure-style helper to keep each test
3642    /// self-contained.
3643    fn build_nvidia_cdi_registry(dir: &std::path::Path) -> std::sync::Arc<crate::cdi::CdiRegistry> {
3644        write_nvidia_cdi_fixture(dir, nvidia_cdi_fixture());
3645        std::sync::Arc::new(crate::cdi::CdiRegistry::discover_from(&[dir]))
3646    }
3647
3648    #[cfg(target_os = "linux")]
3649    #[tokio::test]
3650    async fn gpu_spec_with_mps_sharing_injects_env_and_mounts() {
3651        // Stage host-side MPS directories in a tempdir so the resolver's
3652        // `is_dir()` check passes without touching /tmp/nvidia-mps on the
3653        // real host.
3654        let cdi_dir = tempfile::tempdir().unwrap();
3655        let mps_root = tempfile::tempdir().unwrap();
3656        let pipe_dir = mps_root.path().join("nvidia-mps");
3657        let log_dir = mps_root.path().join("nvidia-log");
3658        std::fs::create_dir(&pipe_dir).unwrap();
3659        std::fs::create_dir(&log_dir).unwrap();
3660        let registry = build_nvidia_cdi_registry(cdi_dir.path());
3661
3662        let id = ContainerId::new("test".to_string(), 1);
3663        let mut spec = mock_gpu_spec("nvidia", 1);
3664        let gpu = spec.resources.gpu.as_mut().expect("gpu spec set");
3665        gpu.sharing = Some(zlayer_spec::GpuSharingMode::Mps);
3666        gpu.mps_pipe_dir = Some(pipe_dir.to_string_lossy().into_owned());
3667        gpu.mps_log_dir = Some(log_dir.to_string_lossy().into_owned());
3668
3669        let builder =
3670            BundleBuilder::new("/tmp/test-bundle-mps-env".into()).with_cdi_registry(registry);
3671        let oci_spec = builder
3672            .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3673            .await
3674            .expect("build with MPS sharing");
3675
3676        let env = oci_spec
3677            .process()
3678            .as_ref()
3679            .and_then(|p| p.env().as_ref())
3680            .expect("env present");
3681        let pipe_expect = format!("CUDA_MPS_PIPE_DIRECTORY={}", pipe_dir.display());
3682        let log_expect = format!("CUDA_MPS_LOG_DIRECTORY={}", log_dir.display());
3683        assert!(
3684            env.iter().any(|e| e == &pipe_expect),
3685            "expected {pipe_expect} in env; got {env:?}"
3686        );
3687        assert!(
3688            env.iter().any(|e| e == &log_expect),
3689            "expected {log_expect} in env; got {env:?}"
3690        );
3691
3692        let mounts = oci_spec.mounts().as_ref().expect("mounts present");
3693        assert!(
3694            mounts
3695                .iter()
3696                .any(|m| m.destination() == &pipe_dir && m.source().as_ref() == Some(&pipe_dir)),
3697            "expected bind mount of MPS pipe dir {}; got destinations {:?}",
3698            pipe_dir.display(),
3699            mounts.iter().map(Mount::destination).collect::<Vec<_>>()
3700        );
3701        assert!(
3702            mounts
3703                .iter()
3704                .any(|m| m.destination() == &log_dir && m.source().as_ref() == Some(&log_dir)),
3705            "expected bind mount of MPS log dir {}",
3706            log_dir.display()
3707        );
3708    }
3709
3710    #[tokio::test]
3711    async fn gpu_spec_with_mps_sharing_fails_when_pipe_dir_missing() {
3712        let cdi_dir = tempfile::tempdir().unwrap();
3713        let registry = build_nvidia_cdi_registry(cdi_dir.path());
3714
3715        let id = ContainerId::new("test".to_string(), 1);
3716        let mut spec = mock_gpu_spec("nvidia", 1);
3717        let gpu = spec.resources.gpu.as_mut().expect("gpu spec set");
3718        gpu.sharing = Some(zlayer_spec::GpuSharingMode::Mps);
3719        // Path that demonstrably does not exist — tempdir() returns a unique
3720        // path so appending "definitely-not-here" gives a guaranteed miss.
3721        let missing = tempfile::tempdir().unwrap();
3722        let missing_path = missing.path().join("definitely-not-here");
3723        gpu.mps_pipe_dir = Some(missing_path.to_string_lossy().into_owned());
3724
3725        let builder =
3726            BundleBuilder::new("/tmp/test-bundle-mps-missing".into()).with_cdi_registry(registry);
3727        let err = builder
3728            .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3729            .await
3730            .expect_err("should fail when MPS pipe dir is missing");
3731        match err {
3732            AgentError::GpuSharingUnavailable { mode, reason } => {
3733                assert_eq!(mode, "mps");
3734                assert!(
3735                    reason.contains("pipe") || reason.contains(&missing_path.display().to_string()),
3736                    "reason should mention the missing path; got: {reason}"
3737                );
3738            }
3739            other => panic!("expected GpuSharingUnavailable, got {other:?}"),
3740        }
3741    }
3742
3743    #[cfg(target_os = "linux")]
3744    #[tokio::test]
3745    async fn gpu_spec_with_timeslicing_injects_visible_devices() {
3746        let cdi_dir = tempfile::tempdir().unwrap();
3747        let registry = build_nvidia_cdi_registry(cdi_dir.path());
3748
3749        let id = ContainerId::new("test".to_string(), 1);
3750        let mut spec = mock_gpu_spec("nvidia", 1);
3751        let gpu = spec.resources.gpu.as_mut().expect("gpu spec set");
3752        gpu.sharing = Some(zlayer_spec::GpuSharingMode::TimeSlice);
3753        gpu.time_slice_index = Some(2);
3754
3755        let builder =
3756            BundleBuilder::new("/tmp/test-bundle-timeslice".into()).with_cdi_registry(registry);
3757        let oci_spec = builder
3758            .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3759            .await
3760            .expect("build with time-slicing");
3761
3762        let env = oci_spec
3763            .process()
3764            .as_ref()
3765            .and_then(|p| p.env().as_ref())
3766            .expect("env present");
3767        // Time-slicing must clobber any earlier `CUDA_VISIBLE_DEVICES` (e.g.
3768        // the CDI-emitted full-device list) to advertise exactly the slice.
3769        let cuda_entries: Vec<&String> = env
3770            .iter()
3771            .filter(|e| e.starts_with("CUDA_VISIBLE_DEVICES="))
3772            .collect();
3773        assert_eq!(
3774            cuda_entries.len(),
3775            1,
3776            "exactly one CUDA_VISIBLE_DEVICES expected; got {cuda_entries:?}"
3777        );
3778        assert_eq!(cuda_entries[0], "CUDA_VISIBLE_DEVICES=2");
3779    }
3780
3781    #[cfg(target_os = "linux")]
3782    #[tokio::test]
3783    async fn gpu_spec_without_sharing_omits_mps_env() {
3784        let cdi_dir = tempfile::tempdir().unwrap();
3785        let registry = build_nvidia_cdi_registry(cdi_dir.path());
3786
3787        let id = ContainerId::new("test".to_string(), 1);
3788        let spec = mock_gpu_spec("nvidia", 1);
3789        assert!(spec.resources.gpu.as_ref().unwrap().sharing.is_none());
3790
3791        let builder =
3792            BundleBuilder::new("/tmp/test-bundle-no-sharing".into()).with_cdi_registry(registry);
3793        let oci_spec = builder
3794            .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3795            .await
3796            .expect("build without sharing");
3797
3798        let env = oci_spec
3799            .process()
3800            .as_ref()
3801            .and_then(|p| p.env().as_ref())
3802            .expect("env present");
3803        assert!(
3804            !env.iter().any(|e| e.starts_with("CUDA_MPS_")),
3805            "no CUDA_MPS_* env should be present without sharing; got {env:?}"
3806        );
3807
3808        // No MPS mount should be added either. The 5.A CDI fixture mounts a
3809        // /dev/nvidia0 device but never bind-mounts /tmp/nvidia-mps; verify
3810        // we don't sneak that in.
3811        let mounts = oci_spec.mounts().as_ref().expect("mounts present");
3812        assert!(
3813            !mounts
3814                .iter()
3815                .any(|m| { m.destination().to_string_lossy().contains("nvidia-mps") }),
3816            "no MPS pipe mount should be present without sharing"
3817        );
3818    }
3819
3820    #[cfg(unix)]
3821    mod subid_tests {
3822        use super::super::read_subid_range;
3823        use std::io::Write;
3824
3825        #[test]
3826        fn read_subid_range_returns_range_for_user() {
3827            let mut tmp = tempfile::NamedTempFile::new().unwrap();
3828            writeln!(tmp, "alice:100000:65536").unwrap();
3829            writeln!(tmp, "bob:165536:65536").unwrap();
3830            tmp.flush().unwrap();
3831            let path = tmp.path().to_str().unwrap();
3832            assert_eq!(read_subid_range(path, "bob"), Some((165_536, 65_536)));
3833            assert_eq!(read_subid_range(path, "alice"), Some((100_000, 65_536)));
3834        }
3835
3836        #[test]
3837        fn read_subid_range_returns_none_for_unknown_user() {
3838            let mut tmp = tempfile::NamedTempFile::new().unwrap();
3839            writeln!(tmp, "alice:100000:65536").unwrap();
3840            tmp.flush().unwrap();
3841            assert_eq!(
3842                read_subid_range(tmp.path().to_str().unwrap(), "carol"),
3843                None
3844            );
3845        }
3846
3847        #[test]
3848        fn read_subid_range_returns_none_on_missing_file() {
3849            assert_eq!(
3850                read_subid_range("/this/path/does/not/exist/subuid", "anyone"),
3851                None
3852            );
3853        }
3854    }
3855}