Skip to main content

zlayer_agent/
bundle.rs

1//! OCI Bundle Creation
2//!
3//! Creates OCI-compliant bundles for container runtimes using libcontainer (youki).
4//! A bundle consists of a directory with:
5//! - config.json: OCI runtime specification
6//! - rootfs/: Container filesystem (symlink or bind mount target)
7
8use crate::cdi::{self, CdiContainerEdits, CdiRegistry};
9use crate::error::{AgentError, Result};
10use crate::runtime::ContainerId;
11use oci_spec::runtime::{
12    Capability, Hook, HookBuilder, Hooks, HooksBuilder, LinuxBuilder, LinuxCapabilitiesBuilder,
13    LinuxCpuBuilder, LinuxDeviceBuilder, LinuxDeviceCgroupBuilder, LinuxDeviceType,
14    LinuxMemoryBuilder, LinuxNamespaceBuilder, LinuxNamespaceType, LinuxResourcesBuilder, Mount,
15    MountBuilder, PosixRlimit, PosixRlimitBuilder, PosixRlimitType, ProcessBuilder, RootBuilder,
16    Spec, SpecBuilder, UserBuilder,
17};
18// `LinuxIdMappingBuilder` is only used by the unix-gated rootless user-namespace
19// helpers below; importing it unconditionally trips dead-code lints on Windows.
20#[cfg(unix)]
21use oci_spec::runtime::LinuxIdMappingBuilder;
22use std::collections::{HashMap, HashSet};
23// `MetadataExt` is only meaningful on Unix-like hosts where `/dev/*` nodes exist
24// and have major/minor numbers. On Windows this module is still built so that
25// `BundleBuilder::build_spec_only` (cross-platform OCI Spec generation) can be
26// called from the WSL2 delegate runtime, which then pipes the generated
27// `config.json` into a Linux WSL2 distro that owns the actual device
28// fingerprint. See G-1 / G-2 in the Windows plan. The import is performed
29// inside `get_device_major_minor` itself to avoid an unused-import warning on
30// non-Unix platforms.
31use std::path::{Path, PathBuf};
32use std::str::FromStr;
33use std::sync::Arc;
34use tokio::fs;
35use zlayer_secrets::SecretsProvider;
36use zlayer_spec::{GpuSharingMode, ServiceSpec, StorageSpec, StorageTier};
37
/// Host directory used for the NVIDIA MPS control pipe whenever the spec
/// leaves [`zlayer_spec::GpuSpec::mps_pipe_dir`] unset.
const DEFAULT_MPS_PIPE_DIR: &str = "/tmp/nvidia-mps";

/// Host directory used for NVIDIA MPS log output whenever the spec leaves
/// [`zlayer_spec::GpuSpec::mps_log_dir`] unset.
const DEFAULT_MPS_LOG_DIR: &str = "/tmp/nvidia-log";

/// In-container location (read-only) of a host-supplied NVIDIA time-slicing
/// config YAML. `ZLayer` never interprets the file; it is purely informational
/// so that tools inside the container can discover the slice topology.
const TIMESLICE_CONFIG_CONTAINER_PATH: &str = "/etc/nvidia/gpu-time-slicing.yaml";
51
/// The pair of host directories (control pipe + logs) backing NVIDIA MPS.
///
/// Produced by [`resolve_mps_dirs`], and only when `GpuSpec.sharing == Mps`.
/// Both entries were verified to be existing directories at the moment the
/// helper ran, so callers can bind-mount them as-is.
struct MpsDirs {
    /// Host directory holding the MPS control pipe.
    pipe_dir: PathBuf,
    /// Host directory receiving MPS log output.
    log_dir: PathBuf,
}
61
62/// Resolve and validate the MPS pipe / log directories for a GPU spec.
63///
64/// Returns `Ok(None)` when sharing is not MPS (or absent), `Ok(Some(...))`
65/// when both directories exist on the host, or
66/// [`AgentError::GpuSharingUnavailable`] when either directory is missing.
67///
68/// Defaults to [`DEFAULT_MPS_PIPE_DIR`] / [`DEFAULT_MPS_LOG_DIR`] when the
69/// spec omits explicit paths, matching the convention used by
70/// `nvidia-cuda-mps-control` out of the box.
71fn resolve_mps_dirs(gpu: &zlayer_spec::GpuSpec) -> Result<Option<MpsDirs>> {
72    if gpu.sharing != Some(GpuSharingMode::Mps) {
73        return Ok(None);
74    }
75
76    let pipe_dir = PathBuf::from(gpu.mps_pipe_dir.as_deref().unwrap_or(DEFAULT_MPS_PIPE_DIR));
77    let log_dir = PathBuf::from(gpu.mps_log_dir.as_deref().unwrap_or(DEFAULT_MPS_LOG_DIR));
78
79    if !pipe_dir.is_dir() {
80        return Err(AgentError::GpuSharingUnavailable {
81            mode: "mps".to_string(),
82            reason: format!(
83                "MPS pipe directory {} does not exist; ensure nvidia-cuda-mps-control is running",
84                pipe_dir.display()
85            ),
86        });
87    }
88    if !log_dir.is_dir() {
89        return Err(AgentError::GpuSharingUnavailable {
90            mode: "mps".to_string(),
91            reason: format!(
92                "MPS log directory {} does not exist; ensure nvidia-cuda-mps-control is running",
93                log_dir.display()
94            ),
95        });
96    }
97
98    Ok(Some(MpsDirs { pipe_dir, log_dir }))
99}
100
101/// Convert a CDI device node descriptor into the OCI [`LinuxDevice`] used by
102/// the runtime.
103///
104/// CDI device nodes may omit `type`, `major`, and `minor` — in that case we
105/// probe the host (via `get_device_type` / `get_device_major_minor`) using
106/// the resolved host path, falling back to character device with zero
107/// major/minor when the file is unavailable (typical for test fixtures
108/// that reference paths that don't exist on the build host).
109fn cdi_node_to_oci_device(
110    node: &crate::cdi::CdiDeviceNode,
111) -> Result<oci_spec::runtime::LinuxDevice> {
112    let host_path = node.host_path.as_deref().unwrap_or(&node.path);
113
114    let dev_type = match node.device_type.as_deref() {
115        Some("c" | "u") => LinuxDeviceType::C,
116        Some("b") => LinuxDeviceType::B,
117        Some("p") => LinuxDeviceType::P,
118        _ => get_device_type(host_path).unwrap_or(LinuxDeviceType::C),
119    };
120
121    let (major, minor) = if let (Some(maj), Some(min)) = (node.major, node.minor) {
122        (maj, min)
123    } else {
124        get_device_major_minor(host_path).unwrap_or((0, 0))
125    };
126
127    let mut builder = LinuxDeviceBuilder::default()
128        .path(node.path.clone())
129        .typ(dev_type)
130        .major(major)
131        .minor(minor);
132    if let Some(mode) = node.file_mode {
133        builder = builder.file_mode(mode);
134    } else {
135        builder = builder.file_mode(0o666u32);
136    }
137    builder = builder.uid(node.uid.unwrap_or(0));
138    builder = builder.gid(node.gid.unwrap_or(0));
139
140    builder.build().map_err(|e| {
141        AgentError::InvalidSpec(format!(
142            "failed to build CDI device {path}: {e}",
143            path = node.path
144        ))
145    })
146}
147
148/// Convert a CDI hook descriptor into the OCI [`Hook`] used by the runtime.
149fn convert_cdi_hook(cdi_hook: &crate::cdi::CdiHook) -> Result<Hook> {
150    let mut builder = HookBuilder::default().path(PathBuf::from(&cdi_hook.path));
151    if !cdi_hook.args.is_empty() {
152        builder = builder.args(cdi_hook.args.clone());
153    }
154    if !cdi_hook.env.is_empty() {
155        builder = builder.env(cdi_hook.env.clone());
156    }
157    builder
158        .build()
159        .map_err(|e| AgentError::InvalidSpec(format!("failed to build CDI hook: {e}")))
160}
161
162/// All Linux capabilities for privileged mode
163const ALL_CAPABILITIES: &[Capability] = &[
164    Capability::AuditControl,
165    Capability::AuditRead,
166    Capability::AuditWrite,
167    Capability::BlockSuspend,
168    Capability::Bpf,
169    Capability::CheckpointRestore,
170    Capability::Chown,
171    Capability::DacOverride,
172    Capability::DacReadSearch,
173    Capability::Fowner,
174    Capability::Fsetid,
175    Capability::IpcLock,
176    Capability::IpcOwner,
177    Capability::Kill,
178    Capability::Lease,
179    Capability::LinuxImmutable,
180    Capability::MacAdmin,
181    Capability::MacOverride,
182    Capability::Mknod,
183    Capability::NetAdmin,
184    Capability::NetBindService,
185    Capability::NetBroadcast,
186    Capability::NetRaw,
187    Capability::Perfmon,
188    Capability::Setfcap,
189    Capability::Setgid,
190    Capability::Setpcap,
191    Capability::Setuid,
192    Capability::SysAdmin,
193    Capability::SysBoot,
194    Capability::SysChroot,
195    Capability::SysModule,
196    Capability::SysNice,
197    Capability::SysPacct,
198    Capability::SysPtrace,
199    Capability::SysRawio,
200    Capability::SysResource,
201    Capability::SysTime,
202    Capability::SysTtyConfig,
203    Capability::Syslog,
204    Capability::WakeAlarm,
205];
206
/// Render the contents of an `/etc/resolv.conf` for the given resolver
/// addresses.
///
/// One `nameserver <ip>` line per entry, in order, followed by a single
/// `options edns0` line (enables EDNS(0) so larger UDP responses — e.g. the
/// overlay resolver forwarding A/AAAA records — are not truncated). The output
/// is deliberately minimal: no `search`/`domain` directives, which would
/// otherwise be inherited from the (hijacked) host resolv.conf we are
/// replacing.
///
/// This exists because youki/libcontainer performs NO resolv.conf handling of
/// its own — without an explicit bind mount the container sees only whatever
/// `/etc/resolv.conf` shipped in the image (often empty or absent). The caller
/// writes this string into the bundle directory and bind-mounts it read-only at
/// `/etc/resolv.conf`.
#[must_use]
pub fn generate_resolv_conf(nameservers: &[String]) -> String {
    let mut out = String::new();
    for ns in nameservers {
        out.push_str("nameserver ");
        out.push_str(ns);
        out.push('\n');
    }
    out.push_str("options edns0\n");
    out
}

/// Parse memory string like "512Mi", "1Gi" to bytes
///
/// Leading and trailing whitespace is ignored. Supports both IEC (binary) and
/// SI (decimal) units:
/// - IEC: Ki, Mi, Gi, Ti (powers of 1024)
/// - SI: K/k, M/m, G/g, T/t (powers of 1000)
/// - No suffix: bytes
///
/// # Examples
/// ```ignore
/// assert_eq!(parse_memory_string("512Mi").unwrap(), 512 * 1024 * 1024);
/// assert_eq!(parse_memory_string("1Gi").unwrap(), 1024 * 1024 * 1024);
/// assert_eq!(parse_memory_string("2G").unwrap(), 2 * 1000 * 1000 * 1000);
/// ```
///
/// # Errors
/// Returns an error if the string is empty, if the numeric part is not a
/// valid unsigned integer, or if the resulting byte count does not fit in a
/// `u64`.
pub fn parse_memory_string(s: &str) -> std::result::Result<u64, String> {
    let s = s.trim();
    if s.is_empty() {
        return Err("empty memory string".to_string());
    }

    // Two-letter IEC suffixes are tried before the one-letter SI suffixes.
    let (num_str, multiplier) = if let Some(n) = s.strip_suffix("Ki") {
        (n, 1024u64)
    } else if let Some(n) = s.strip_suffix("Mi") {
        (n, 1024u64 * 1024)
    } else if let Some(n) = s.strip_suffix("Gi") {
        (n, 1024u64 * 1024 * 1024)
    } else if let Some(n) = s.strip_suffix("Ti") {
        (n, 1024u64 * 1024 * 1024 * 1024)
    } else if let Some(n) = s.strip_suffix('K').or_else(|| s.strip_suffix('k')) {
        (n, 1000u64)
    } else if let Some(n) = s.strip_suffix('M').or_else(|| s.strip_suffix('m')) {
        (n, 1000u64 * 1000)
    } else if let Some(n) = s.strip_suffix('G').or_else(|| s.strip_suffix('g')) {
        (n, 1000u64 * 1000 * 1000)
    } else if let Some(n) = s.strip_suffix('T').or_else(|| s.strip_suffix('t')) {
        (n, 1000u64 * 1000 * 1000 * 1000)
    } else {
        (s, 1u64)
    };

    let num: u64 = num_str
        .parse()
        .map_err(|e| format!("invalid number: {e}"))?;

    // A plain `*` would panic in debug builds and silently wrap in release
    // builds for absurdly large inputs; report those as a parse error instead.
    num.checked_mul(multiplier)
        .ok_or_else(|| format!("memory size overflows u64: {s}"))
}
282
/// Split a raw Linux `dev_t` value into its `(major, minor)` device numbers.
///
/// Linux (glibc `gnu_dev_major` / `gnu_dev_minor`) spreads both numbers over
/// the 64-bit value:
/// - major: bits 8..20 (low 12 bits) plus bits 44..64 (high 20 bits)
/// - minor: bits 0..8 (low 8 bits) plus bits 20..44 (high 24 bits)
///
/// Masking with `0xff` only — as a legacy 16-bit `dev_t` would allow —
/// mis-decodes any device whose major or minor exceeds 255.
///
/// NOTE(review): other Unix targets (e.g. macOS) encode `dev_t` differently;
/// this helper assumes the Linux layout — confirm before relying on it there.
#[cfg(unix)]
// Both values are masked to at most 32 bits, so the casts cannot wrap.
#[allow(clippy::cast_possible_wrap)]
fn split_linux_rdev(rdev: u64) -> (i64, i64) {
    let major = ((rdev >> 32) & 0xffff_f000) | ((rdev >> 8) & 0x0000_0fff);
    let minor = ((rdev >> 12) & 0xffff_ff00) | (rdev & 0x0000_00ff);
    (major as i64, minor as i64)
}

/// Get major and minor device numbers from a device path
///
/// Unix-only: relies on `MetadataExt::rdev()` which isn't available on Windows.
/// When `bundle.rs` is compiled for a Windows host (for the WSL2 delegate's
/// cross-platform `build_spec_only` path), device probing is skipped entirely —
/// the Linux side of the delegate is responsible for its own device fingerprint.
/// The non-Unix stub below returns `Unsupported` so the `if let Ok(..)` /
/// `.unwrap_or(..)` call sites at the CDI / GPU passthrough paths skip cleanly.
///
/// # Errors
/// Propagates the I/O error from `std::fs::metadata` (e.g. the path does not
/// exist or is not accessible).
#[cfg(unix)]
fn get_device_major_minor(path: &str) -> std::io::Result<(i64, i64)> {
    use std::os::unix::fs::MetadataExt;
    let metadata = std::fs::metadata(path)?;
    Ok(split_linux_rdev(metadata.rdev()))
}

/// Non-Unix stub: device-cgroup probes require Unix; callers use `if let Ok(..)` to skip.
#[cfg(not(unix))]
fn get_device_major_minor(_path: &str) -> std::io::Result<(i64, i64)> {
    Err(std::io::Error::new(
        std::io::ErrorKind::Unsupported,
        "device-cgroup probes require Unix",
    ))
}
311
312/// Translate the Docker `--ulimit <name>` style key into the OCI
313/// `PosixRlimitType` enum. Returns `None` for unknown names so the caller
314/// can surface a clean error.
315fn ulimit_name_to_posix(name: &str) -> Option<PosixRlimitType> {
316    Some(match name.to_ascii_lowercase().as_str() {
317        "cpu" => PosixRlimitType::RlimitCpu,
318        "fsize" => PosixRlimitType::RlimitFsize,
319        "data" => PosixRlimitType::RlimitData,
320        "stack" => PosixRlimitType::RlimitStack,
321        "core" => PosixRlimitType::RlimitCore,
322        "rss" => PosixRlimitType::RlimitRss,
323        "nproc" => PosixRlimitType::RlimitNproc,
324        "nofile" => PosixRlimitType::RlimitNofile,
325        "memlock" => PosixRlimitType::RlimitMemlock,
326        "as" => PosixRlimitType::RlimitAs,
327        "locks" => PosixRlimitType::RlimitLocks,
328        "sigpending" => PosixRlimitType::RlimitSigpending,
329        "msgqueue" => PosixRlimitType::RlimitMsgqueue,
330        "nice" => PosixRlimitType::RlimitNice,
331        "rtprio" => PosixRlimitType::RlimitRtprio,
332        "rttime" => PosixRlimitType::RlimitRttime,
333        _ => return None,
334    })
335}
336
#[cfg(test)]
mod ulimit_translation_tests {
    use super::{ulimit_name_to_posix, PosixRlimitType};

    /// Recognised names map to their rlimit type, regardless of ASCII case.
    #[test]
    fn known_names_map() {
        let cases = [
            ("nofile", PosixRlimitType::RlimitNofile),
            ("NOFILE", PosixRlimitType::RlimitNofile),
            ("nproc", PosixRlimitType::RlimitNproc),
            ("as", PosixRlimitType::RlimitAs),
        ];
        for (name, expected) in cases {
            assert_eq!(ulimit_name_to_posix(name), Some(expected));
        }
    }

    /// Unrecognised names — including the empty string — yield `None`.
    #[test]
    fn unknown_names_return_none() {
        for name in ["not_a_real_ulimit", ""] {
            assert!(ulimit_name_to_posix(name).is_none());
        }
    }
}
364
365/// Detect device type from path
366///
367/// Unix-only: uses `FileTypeExt::is_char_device` / `is_block_device` which are
368/// not available on Windows. See `get_device_major_minor` for the rationale.
369#[cfg(unix)]
370fn get_device_type(path: &str) -> std::io::Result<LinuxDeviceType> {
371    use std::os::unix::fs::FileTypeExt;
372    let metadata = std::fs::metadata(path)?;
373    let file_type = metadata.file_type();
374    if file_type.is_char_device() {
375        Ok(LinuxDeviceType::C)
376    } else if file_type.is_block_device() {
377        Ok(LinuxDeviceType::B)
378    } else {
379        Ok(LinuxDeviceType::U) // Unknown/other
380    }
381}
382
383/// Non-Unix stub: device-cgroup probes require Unix; callers use `.unwrap_or(..)` to skip.
384#[cfg(not(unix))]
385fn get_device_type(_path: &str) -> std::io::Result<LinuxDeviceType> {
386    Err(std::io::Error::new(
387        std::io::ErrorKind::Unsupported,
388        "device-cgroup probes require Unix",
389    ))
390}
391
392/// Builder for OCI container bundles
393///
394/// Creates the directory structure and config.json required for OCI-compliant
395/// container runtimes like runc or youki.
396///
397/// # Example
398/// ```ignore
399/// let dirs = zlayer_paths::ZLayerDirs::system_default();
400/// let builder = BundleBuilder::new(dirs.bundles().join("mycontainer"))
401///     .with_rootfs(dirs.rootfs().join("myimage"));
402///
403/// let bundle_path = builder.build(&container_id, &service_spec).await?;
404/// ```
405#[derive(Clone)]
406pub struct BundleBuilder {
407    /// Base directory for the bundle
408    bundle_dir: PathBuf,
409    /// Path to the unpacked rootfs (from image layers)
410    rootfs_path: Option<PathBuf>,
411    /// Custom hostname (defaults to container ID)
412    hostname: Option<String>,
413    /// Additional environment variables
414    extra_env: Vec<(String, String)>,
415    /// Custom working directory
416    cwd: Option<String>,
417    /// Custom command/args to run (overrides image default)
418    args: Option<Vec<String>>,
419    /// Pre-resolved volume paths from `StorageManager`
420    volume_paths: HashMap<String, PathBuf>,
421    /// Image configuration from the OCI registry (entrypoint, cmd, env, workdir, user)
422    image_config: Option<zlayer_registry::ImageConfig>,
423    /// Use host networking (skip Network namespace, container shares host network)
424    host_network: bool,
425    /// Secrets provider for resolving $S: prefixed env vars
426    secrets_provider: Option<Arc<dyn SecretsProvider>>,
427    /// Deployment scope for secret lookups (e.g., deployment name)
428    deployment_scope: Option<String>,
429    /// Host-side Unix socket path to bind-mount into the container
430    socket_path: Option<String>,
431    /// Optional CDI registry override (defaults to discovery from system paths).
432    ///
433    /// Wrapped in `Arc` so [`BundleBuilder`] can stay [`Clone`]. Primarily set
434    /// in tests via [`BundleBuilder::with_cdi_registry`]; production paths
435    /// leave this `None` and lazy-discover via [`CdiRegistry::discover`] when
436    /// a `GpuSpec` is present.
437    cdi_registry: Option<Arc<CdiRegistry>>,
438}
439
440impl std::fmt::Debug for BundleBuilder {
441    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
442        f.debug_struct("BundleBuilder")
443            .field("bundle_dir", &self.bundle_dir)
444            .field("rootfs_path", &self.rootfs_path)
445            .field("hostname", &self.hostname)
446            .field("extra_env", &self.extra_env)
447            .field("cwd", &self.cwd)
448            .field("args", &self.args)
449            .field("volume_paths", &self.volume_paths)
450            .field("image_config", &self.image_config)
451            .field("host_network", &self.host_network)
452            .field("secrets_provider", &self.secrets_provider.is_some())
453            .field("deployment_scope", &self.deployment_scope)
454            .field("socket_path", &self.socket_path)
455            .field("cdi_registry", &self.cdi_registry.is_some())
456            .finish()
457    }
458}
459
460/// Build OCI `uid_mappings` (or `gid_mappings` — same structure) for a rootless
461/// container. Always emits a single-id mapping (container 0 → `host_id`, size 1).
462/// If `username` has an entry in `subid_path` (e.g. /etc/subuid), appends a
463/// range mapping (container 1 → range start, size = range count).
464///
465/// Rootless user-namespace mapping is a Linux/libcontainer concept; on Windows
466/// containers run via HCS so this helper is unix-only.
467#[cfg(unix)]
468fn build_rootless_id_mappings(
469    host_id: u32,
470    subid_path: &str,
471    username: &str,
472) -> Vec<oci_spec::runtime::LinuxIdMapping> {
473    let mut mappings = vec![LinuxIdMappingBuilder::default()
474        .container_id(0_u32)
475        .host_id(host_id)
476        .size(1_u32)
477        .build()
478        .unwrap()];
479    if !username.is_empty() {
480        if let Some((start, count)) = read_subid_range(subid_path, username) {
481            mappings.push(
482                LinuxIdMappingBuilder::default()
483                    .container_id(1_u32)
484                    .host_id(start)
485                    .size(count)
486                    .build()
487                    .unwrap(),
488            );
489        }
490    }
491    mappings
492}
493
/// Read /etc/subuid (or /etc/subgid) and return the (start, count) range
/// allocated to the given username, if any.
///
/// Each line has the form `user:start:count`. The first well-formed line
/// whose user field equals `username` wins. Lines that are malformed
/// (missing fields, non-numeric start/count) are skipped rather than
/// aborting the search, so one bad entry cannot hide a later valid one.
/// Whitespace around the numeric fields (including a trailing `\r`) is
/// ignored.
///
/// Returns None on any I/O error or when the user has no usable entry —
/// callers must fall back to a single-id mapping in that case.
///
/// Subuid files are a Linux concept and the only caller is the unix-gated
/// `build_rootless_id_mappings`, so this helper is unix-only as well.
#[cfg(unix)]
fn read_subid_range(path: &str, username: &str) -> Option<(u32, u32)> {
    let contents = std::fs::read_to_string(path).ok()?;
    contents.lines().find_map(|line| {
        // Inside this closure `?` only rejects the current line.
        let mut fields = line.splitn(3, ':');
        if fields.next()? != username {
            return None;
        }
        let start: u32 = fields.next()?.trim().parse().ok()?;
        let count: u32 = fields.next()?.trim().parse().ok()?;
        Some((start, count))
    })
}
516
517impl BundleBuilder {
518    /// Create a new `BundleBuilder` with the specified bundle directory
519    ///
520    /// The bundle directory will be created if it doesn't exist.
521    /// The structure will be:
522    /// ```text
523    /// {bundle_dir}/
524    /// ├── config.json
525    /// └── rootfs/  (symlink to actual rootfs or mount point)
526    /// ```
527    #[must_use]
528    pub fn new(bundle_dir: PathBuf) -> Self {
529        Self {
530            bundle_dir,
531            rootfs_path: None,
532            hostname: None,
533            extra_env: Vec::new(),
534            cwd: None,
535            args: None,
536            volume_paths: HashMap::new(),
537            image_config: None,
538            host_network: false,
539            secrets_provider: None,
540            deployment_scope: None,
541            socket_path: None,
542            cdi_registry: None,
543        }
544    }
545
546    /// Override the CDI registry used for GPU device resolution.
547    ///
548    /// When unset, [`build_oci_spec`](Self::build_oci_spec) discovers CDI
549    /// specs lazily from the standard system search paths (`/etc/cdi`,
550    /// `/var/run/cdi`, plus `$CDI_SPEC_DIRS`). Tests use this setter to
551    /// inject fixture-backed registries pointed at a temp directory.
552    #[must_use]
553    pub fn with_cdi_registry(mut self, registry: Arc<CdiRegistry>) -> Self {
554        self.cdi_registry = Some(registry);
555        self
556    }
557
558    /// Create a `BundleBuilder` for a container in the default bundle location
559    #[must_use]
560    pub fn for_container(container_id: &ContainerId) -> Self {
561        let bundle_dir = zlayer_paths::ZLayerDirs::system_default()
562            .bundles()
563            .join(container_id.to_string());
564        Self::new(bundle_dir)
565    }
566
567    /// Set the rootfs path (from unpacked image layers)
568    ///
569    /// This path will be symlinked into the bundle as `rootfs/`
570    #[must_use]
571    pub fn with_rootfs(mut self, rootfs_path: PathBuf) -> Self {
572        self.rootfs_path = Some(rootfs_path);
573        self
574    }
575
576    /// Set a custom hostname for the container
577    #[must_use]
578    pub fn with_hostname(mut self, hostname: String) -> Self {
579        self.hostname = Some(hostname);
580        self
581    }
582
583    /// Add extra environment variables
584    #[must_use]
585    pub fn with_env(mut self, key: String, value: String) -> Self {
586        self.extra_env.push((key, value));
587        self
588    }
589
590    /// Set the working directory
591    #[must_use]
592    pub fn with_cwd(mut self, cwd: String) -> Self {
593        self.cwd = Some(cwd);
594        self
595    }
596
597    /// Set the command/args to run
598    #[must_use]
599    pub fn with_args(mut self, args: Vec<String>) -> Self {
600        self.args = Some(args);
601        self
602    }
603
604    /// Set pre-resolved volume paths from `StorageManager`
605    ///
606    /// These are used to map named/anonymous/S3 volumes to their host paths
607    /// when building storage mounts in the OCI spec.
608    #[must_use]
609    pub fn with_volume_paths(mut self, volume_paths: HashMap<String, PathBuf>) -> Self {
610        self.volume_paths = volume_paths;
611        self
612    }
613
614    /// Set the OCI image configuration (entrypoint, cmd, env, workdir, user)
615    ///
616    /// When set, the image config provides defaults for the container process
617    /// that are used when the deployment spec doesn't override them.
618    #[must_use]
619    pub fn with_image_config(mut self, config: zlayer_registry::ImageConfig) -> Self {
620        self.image_config = Some(config);
621        self
622    }
623
624    /// Enable host networking mode
625    ///
626    /// When true, the container will NOT get its own network namespace and will
627    /// share the host's network stack. This is equivalent to Docker's `--network host`.
628    /// Use this when overlay networking is unavailable or not desired.
629    #[must_use]
630    pub fn with_host_network(mut self, host_network: bool) -> Self {
631        self.host_network = host_network;
632        self
633    }
634
635    /// Set the secrets provider for resolving `$S:` prefixed environment variables
636    ///
637    /// When set, environment variables with `$S:secret-name` syntax will be resolved
638    /// from this provider at bundle creation time.
639    #[must_use]
640    pub fn with_secrets_provider(mut self, provider: Arc<dyn SecretsProvider>) -> Self {
641        self.secrets_provider = Some(provider);
642        self
643    }
644
645    /// Set the deployment scope for secret lookups
646    ///
647    /// This is typically the deployment name and is used as the scope when
648    /// resolving `$S:` prefixed environment variables.
649    #[must_use]
650    pub fn with_deployment_scope(mut self, scope: String) -> Self {
651        self.deployment_scope = Some(scope);
652        self
653    }
654
655    /// Set a host-side Unix socket path to bind-mount into the container at
656    /// the default `ZLayer` socket path (read-only).
657    #[must_use]
658    pub fn with_socket_mount(mut self, path: impl Into<String>) -> Self {
659        self.socket_path = Some(path.into());
660        self
661    }
662
663    /// Get the bundle directory path
664    #[must_use]
665    pub fn bundle_dir(&self) -> &Path {
666        &self.bundle_dir
667    }
668
669    /// Build the OCI bundle from a `ServiceSpec`
670    ///
671    /// Creates the bundle directory structure and generates config.json
672    /// based on the provided service specification.
673    ///
674    /// # Returns
675    /// The path to the bundle directory on success
676    ///
677    /// # Errors
678    /// - `AgentError::CreateFailed` if directory creation fails
679    /// - `AgentError::InvalidSpec` if the OCI spec generation fails
680    ///
681    /// # Platform
682    /// Unix-only. Uses `tokio::fs::symlink` which is defined in terms of
683    /// `std::os::unix::fs::symlink` and does not exist on Windows. The Windows
684    /// WSL2 delegate path should call [`BundleBuilder::build_spec_only`] to
685    /// obtain the OCI [`Spec`] and pipe it into the WSL2 distro, where the
686    /// Linux side of the delegate handles bundle directory creation.
687    #[cfg(unix)]
688    pub async fn build(&self, container_id: &ContainerId, spec: &ServiceSpec) -> Result<PathBuf> {
689        // Create bundle directory
690        fs::create_dir_all(&self.bundle_dir)
691            .await
692            .map_err(|e| AgentError::CreateFailed {
693                id: container_id.to_string(),
694                reason: format!("failed to create bundle directory: {e}"),
695            })?;
696
697        // Set up rootfs (symlink or create empty directory)
698        let rootfs_in_bundle = self.bundle_dir.join("rootfs");
699        if let Some(ref rootfs_path) = self.rootfs_path {
700            // Remove existing rootfs symlink/dir if present
701            let _ = fs::remove_file(&rootfs_in_bundle).await;
702            let _ = fs::remove_dir(&rootfs_in_bundle).await;
703
704            // Create symlink to actual rootfs.
705            // On Unix: `tokio::fs::symlink` (unified file/dir symlink).
706            // On Windows: `tokio::fs::symlink_dir` (wraps CreateSymbolicLinkW with
707            // SYMBOLIC_LINK_FLAG_DIRECTORY) — rootfs is always an OCI layer directory.
708            #[cfg(unix)]
709            tokio::fs::symlink(rootfs_path, &rootfs_in_bundle)
710                .await
711                .map_err(|e| AgentError::CreateFailed {
712                    id: container_id.to_string(),
713                    reason: format!(
714                        "failed to symlink rootfs from {} to {}: {}",
715                        rootfs_path.display(),
716                        rootfs_in_bundle.display(),
717                        e
718                    ),
719                })?;
720
721            #[cfg(windows)]
722            tokio::fs::symlink_dir(rootfs_path, &rootfs_in_bundle)
723                .await
724                .map_err(|e| AgentError::CreateFailed {
725                    id: container_id.to_string(),
726                    reason: format!(
727                        "failed to symlink rootfs from {} to {}: {}",
728                        rootfs_path.display(),
729                        rootfs_in_bundle.display(),
730                        e
731                    ),
732                })?;
733        } else {
734            // Create empty rootfs directory (for bind mounts)
735            fs::create_dir_all(&rootfs_in_bundle)
736                .await
737                .map_err(|e| AgentError::CreateFailed {
738                    id: container_id.to_string(),
739                    reason: format!("failed to create rootfs directory: {e}"),
740                })?;
741        }
742
743        // Generate OCI runtime spec
744        let oci_spec = self
745            .build_spec_only(container_id, spec, &self.volume_paths)
746            .await?;
747
748        // Write config.json
749        let config_path = self.bundle_dir.join("config.json");
750        let config_json =
751            serde_json::to_string_pretty(&oci_spec).map_err(|e| AgentError::CreateFailed {
752                id: container_id.to_string(),
753                reason: format!("failed to serialize OCI spec: {e}"),
754            })?;
755
756        fs::write(&config_path, config_json)
757            .await
758            .map_err(|e| AgentError::CreateFailed {
759                id: container_id.to_string(),
760                reason: format!("failed to write config.json: {e}"),
761            })?;
762
763        tracing::debug!(
764            "Created OCI bundle at {} for container {}",
765            self.bundle_dir.display(),
766            container_id
767        );
768
769        Ok(self.bundle_dir.clone())
770    }
771
772    /// Render the OCI runtime spec without creating a bundle directory
773    /// or writing `config.json`.
774    ///
775    /// This is the cross-platform entry point for OCI spec generation and is
776    /// the only bundle-builder method that is callable on Windows. Used by the
777    /// WSL2 delegate runtime (`runtimes/wsl2_delegate.rs`): the Windows host
778    /// renders the spec, then streams the JSON into the WSL distro filesystem
779    /// where `youki` will consume it. The bundle path passed to
780    /// `BundleBuilder::new` is purely informational in that flow; this method
781    /// never touches the filesystem.
782    ///
783    /// Unix hosts that want both the spec *and* the on-disk bundle layout
784    /// (rootfs symlink, `config.json`, parent directories) should continue to
785    /// use [`BundleBuilder::build`] or [`BundleBuilder::write_config`].
786    ///
787    /// # Errors
788    /// Returns [`AgentError::InvalidSpec`] if any of the OCI `*Builder` types
789    /// reject the configuration, or if environment-variable secret resolution
790    /// fails.
791    pub async fn build_spec_only(
792        &self,
793        container_id: &ContainerId,
794        spec: &ServiceSpec,
795        volume_paths: &std::collections::HashMap<String, PathBuf>,
796    ) -> Result<oci_spec::runtime::Spec> {
797        self.build_oci_spec(container_id, spec, volume_paths).await
798    }
799
800    /// Resolve CDI edits for a service spec's GPU request, if any.
801    ///
802    /// Returns:
803    /// - `Ok(None)` when the spec has no `GpuSpec`, when the vendor isn't a
804    ///   known CDI-published kind (e.g. `"apple"`), or when no explicit
805    ///   registry was set and lazy discovery turned up no installed specs
806    ///   (production fallback — baked-in defaults take over).
807    /// - `Ok(Some(vec))` with one entry per requested device when CDI specs
808    ///   are available and resolution succeeds.
809    /// - `Err(AgentError::InvalidSpec(...))` when the caller explicitly opted
810    ///   into CDI (via `with_cdi_registry`) but the resolution fails —
811    ///   surfaces [`cdi::CdiError::SpecMissing`] /
812    ///   [`cdi::CdiError::DeviceMissing`] / [`cdi::CdiError::NoDevices`] as
813    ///   actionable strings.
814    fn resolve_cdi_edits(&self, spec: &ServiceSpec) -> Result<Option<Vec<CdiContainerEdits>>> {
815        let Some(ref gpu) = spec.resources.gpu else {
816            return Ok(None);
817        };
818
819        // Map short vendor to CDI kind. Unknown vendors (e.g. "apple") fall
820        // back to baked-in behavior.
821        let Some(kind) = cdi::vendor_to_cdi_kind(&gpu.vendor) else {
822            return Ok(None);
823        };
824
825        // Decide registry source:
826        // - Explicit override: strict mode. Missing kind/device == hard error.
827        // - Lazy discover: opportunistic. Missing kind == silent fallback to
828        //   baked-in defaults so prod hosts without CDI installed keep
829        //   working.
830        let (registry, strict) = if let Some(reg) = &self.cdi_registry {
831            (reg.clone(), true)
832        } else {
833            let reg = Arc::new(CdiRegistry::discover());
834            if reg.is_empty() {
835                return Ok(None);
836            }
837            (reg, false)
838        };
839
840        let device_names: Vec<String> = (0..gpu.count).map(|i| i.to_string()).collect();
841
842        match registry.resolve_for_kind(kind, &device_names) {
843            Ok(edits) => Ok(Some(edits)),
844            Err(err) => {
845                if strict {
846                    Err(AgentError::InvalidSpec(format!(
847                        "CDI resolution failed for vendor '{}': {err}",
848                        gpu.vendor
849                    )))
850                } else {
851                    tracing::warn!(
852                        vendor = %gpu.vendor,
853                        kind = %kind,
854                        error = %err,
855                        "CDI resolution failed; falling back to baked-in GPU device passthrough"
856                    );
857                    Ok(None)
858                }
859            }
860        }
861    }
862
863    /// Build the OCI runtime spec from `ServiceSpec`.
864    ///
865    /// The full, CDI-aware implementation that backs both
866    /// [`BundleBuilder::build_spec_only`] (cross-platform, public) and the
867    /// Unix-only [`BundleBuilder::build`] / [`BundleBuilder::write_config`]
868    /// paths that additionally manage the bundle directory on disk.
869    ///
870    /// # Errors
871    /// Returns [`AgentError::InvalidSpec`] if any of the OCI `*Builder` types
872    /// reject the configuration, or if environment-variable secret resolution
873    /// fails.
874    ///
875    /// # Panics
876    /// Panics if the builder-internal `MountBuilder::build()` call fails for
877    /// the optional `ZLayer` API socket bind-mount. This is only reachable when
878    /// [`BundleBuilder::with_socket_mount`] has been used with a malformed
879    /// path, and is treated as a programmer error because all fields are
880    /// statically constructed from known-good inputs.
881    #[allow(clippy::too_many_lines)]
882    async fn build_oci_spec(
883        &self,
884        container_id: &ContainerId,
885        spec: &ServiceSpec,
886        volume_paths: &std::collections::HashMap<String, PathBuf>,
887    ) -> Result<Spec> {
888        // Resolve CDI edits up front. When present, these replace the
889        // baked-in vendor device-node / env injection below; when absent
890        // (no CDI installed, unknown vendor), the legacy code paths run.
891        let cdi_edits = self.resolve_cdi_edits(spec)?;
892
893        // Build user: image config user > root (spec doesn't currently have user override)
894        let user = {
895            let (uid, gid) = if let Some(user_str) = self
896                .image_config
897                .as_ref()
898                .and_then(|c| c.user.as_ref())
899                .filter(|u| !u.is_empty())
900            {
901                // Parse "uid:gid" or "uid" format from image config
902                let parts: Vec<&str> = user_str.splitn(2, ':').collect();
903                let uid = parts[0].parse::<u32>().unwrap_or(0);
904                let gid = if parts.len() > 1 {
905                    parts[1].parse::<u32>().unwrap_or(0)
906                } else {
907                    uid
908                };
909                (uid, gid)
910            } else {
911                (0u32, 0u32)
912            };
913
914            UserBuilder::default()
915                .uid(uid)
916                .gid(gid)
917                .build()
918                .map_err(|e| AgentError::InvalidSpec(format!("failed to build user: {e}")))?
919        };
920
921        // Build environment variables
922        // Layer: image config env (base) -> defaults -> spec env -> builder extra env
923        let mut env: Vec<String> = Vec::new();
924        let mut env_keys: HashSet<String> = HashSet::new();
925
926        // Seed with image config env first (lowest priority)
927        if let Some(img_env) = self.image_config.as_ref().and_then(|c| c.env.as_ref()) {
928            for entry in img_env {
929                if let Some(key) = entry.split('=').next() {
930                    env_keys.insert(key.to_string());
931                }
932                env.push(entry.clone());
933            }
934        }
935
936        // If image config didn't provide PATH, add the default
937        if !env_keys.contains("PATH") {
938            env.push(
939                "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin".to_string(),
940            );
941            env_keys.insert("PATH".to_string());
942        }
943
944        // Add TERM for interactive compatibility (if not already set)
945        if !env_keys.contains("TERM") {
946            env.push("TERM=xterm".to_string());
947            env_keys.insert("TERM".to_string());
948        }
949
950        // Add service-specific env vars, resolving $S: and $E: prefixed references
951        // These override image config env for same keys
952        //
953        // When a secrets provider is available, use the full secrets-aware resolver
954        // that handles both $S: (secret) and $E: (env) prefixed values.
955        // Otherwise fall back to the env-only resolver.
956        if let (Some(secrets_provider), Some(scope)) =
957            (&self.secrets_provider, &self.deployment_scope)
958        {
959            let resolved_map =
960                crate::env::resolve_env_with_secrets(&spec.env, secrets_provider.as_ref(), scope)
961                    .await
962                    .map_err(|e| {
963                        AgentError::InvalidSpec(format!(
964                            "environment variable resolution failed: {e}"
965                        ))
966                    })?;
967
968            for (key, value) in &resolved_map {
969                if env_keys.contains(key.as_str()) {
970                    env.retain(|e| e.split('=').next() != Some(key.as_str()));
971                }
972                env_keys.insert(key.clone());
973                env.push(format!("{key}={value}"));
974            }
975        } else {
976            let resolved = crate::env::resolve_env_vars_with_warnings(&spec.env).map_err(|e| {
977                AgentError::InvalidSpec(format!("environment variable resolution failed: {e}"))
978            })?;
979
980            // Log any warnings about resolved env vars
981            for warning in &resolved.warnings {
982                tracing::warn!(container = %container_id, "{}", warning);
983            }
984
985            // Merge spec env: spec values take precedence over image config for same keys
986            for var in &resolved.vars {
987                if let Some(key) = var.split('=').next() {
988                    if env_keys.contains(key) {
989                        // Remove the old entry from image config
990                        env.retain(|e| e.split('=').next() != Some(key));
991                    }
992                    env_keys.insert(key.to_string());
993                }
994                env.push(var.clone());
995            }
996        }
997
998        // Add extra env vars from builder (highest priority)
999        for (key, value) in &self.extra_env {
1000            if env_keys.contains(key.as_str()) {
1001                env.retain(|e| e.split('=').next() != Some(key.as_str()));
1002            }
1003            env_keys.insert(key.clone());
1004            env.push(format!("{key}={value}"));
1005        }
1006
1007        // GPU device visibility environment variables.
1008        //
1009        // When CDI edits are available, the vendor-supplied spec is the
1010        // source of truth (e.g. NVIDIA's `nvidia-ctk cdi generate` emits
1011        // `NVIDIA_VISIBLE_DEVICES` plus driver-capability env on every
1012        // device entry). Otherwise fall back to the historical baked-in
1013        // strings so non-CDI hosts continue to advertise the right devices
1014        // to CUDA/ROCm/oneAPI runtimes.
1015        if let Some(ref edits_per_device) = cdi_edits {
1016            for edits in edits_per_device {
1017                for entry in &edits.env {
1018                    if let Some(key) = entry.split('=').next() {
1019                        if env_keys.contains(key) {
1020                            env.retain(|e| e.split('=').next() != Some(key));
1021                        }
1022                        env_keys.insert(key.to_string());
1023                    }
1024                    env.push(entry.clone());
1025                }
1026            }
1027        } else if let Some(ref gpu) = spec.resources.gpu {
1028            // Default to 0..count when no explicit indices are provided
1029            let indices: Vec<String> = (0..gpu.count).map(|i| i.to_string()).collect();
1030            let device_list = indices.join(",");
1031            match gpu.vendor.as_str() {
1032                "nvidia" => {
1033                    env.push(format!("NVIDIA_VISIBLE_DEVICES={device_list}"));
1034                    env.push(format!("CUDA_VISIBLE_DEVICES={device_list}"));
1035                }
1036                "amd" => {
1037                    env.push(format!("ROCR_VISIBLE_DEVICES={device_list}"));
1038                    env.push(format!("HIP_VISIBLE_DEVICES={device_list}"));
1039                }
1040                "intel" => {
1041                    env.push(format!("ZE_AFFINITY_MASK={device_list}"));
1042                }
1043                _ => {}
1044            }
1045        }
1046
1047        // GPU sharing (MPS / time-slicing) env injection.
1048        //
1049        // Layered on top of the CDI / baked-in `*_VISIBLE_DEVICES` block above:
1050        // * MPS: validate host pipe/log dirs exist (error otherwise) and
1051        //   export `CUDA_MPS_PIPE_DIRECTORY` / `CUDA_MPS_LOG_DIRECTORY`.
1052        // * Time-slicing: override `CUDA_VISIBLE_DEVICES` to the configured
1053        //   slice index so the workload sees a single virtualised GPU rather
1054        //   than the full 0..count list emitted above.
1055        //
1056        // The mount side (bind-mounting the MPS dirs / time-slicing config
1057        // file) is handled further down where the rest of the mounts get
1058        // assembled.
1059        let mps_dirs = if let Some(ref gpu) = spec.resources.gpu {
1060            resolve_mps_dirs(gpu)?
1061        } else {
1062            None
1063        };
1064        if let Some(ref dirs) = mps_dirs {
1065            let pipe = format!("CUDA_MPS_PIPE_DIRECTORY={}", dirs.pipe_dir.display());
1066            let log = format!("CUDA_MPS_LOG_DIRECTORY={}", dirs.log_dir.display());
1067            if env_keys.contains("CUDA_MPS_PIPE_DIRECTORY") {
1068                env.retain(|e| e.split('=').next() != Some("CUDA_MPS_PIPE_DIRECTORY"));
1069            }
1070            if env_keys.contains("CUDA_MPS_LOG_DIRECTORY") {
1071                env.retain(|e| e.split('=').next() != Some("CUDA_MPS_LOG_DIRECTORY"));
1072            }
1073            env_keys.insert("CUDA_MPS_PIPE_DIRECTORY".to_string());
1074            env_keys.insert("CUDA_MPS_LOG_DIRECTORY".to_string());
1075            env.push(pipe);
1076            env.push(log);
1077        }
1078        if let Some(ref gpu) = spec.resources.gpu {
1079            if gpu.sharing == Some(GpuSharingMode::TimeSlice) {
1080                if let Some(idx) = gpu.time_slice_index {
1081                    // Time-slicing virtualises a single physical GPU as N
1082                    // slices; the workload sees one device, addressed by
1083                    // its slice index. Override whatever the CDI / baked-in
1084                    // path emitted earlier.
1085                    env.retain(|e| e.split('=').next() != Some("CUDA_VISIBLE_DEVICES"));
1086                    env_keys.insert("CUDA_VISIBLE_DEVICES".to_string());
1087                    env.push(format!("CUDA_VISIBLE_DEVICES={idx}"));
1088                }
1089            }
1090        }
1091
1092        // Inject distributed training coordination env vars when configured.
1093        // MASTER_ADDR uses the service DNS name (resolved by the overlay DNS).
1094        // RANK defaults to 0 (overridden by the agent when placing specific replicas).
1095        if let Some(ref gpu) = spec.resources.gpu {
1096            if let Some(ref dist) = gpu.distributed {
1097                env.push(format!("MASTER_PORT={}", dist.master_port));
1098                env.push(format!("MASTER_ADDR={}", container_id.service));
1099                env.push("WORLD_SIZE=1".to_string());
1100                env.push("RANK=0".to_string());
1101                env.push("LOCAL_RANK=0".to_string());
1102                match dist.backend.as_str() {
1103                    "nccl" => env.push("NCCL_SOCKET_IFNAME=eth0".to_string()),
1104                    "gloo" => env.push("GLOO_SOCKET_IFNAME=eth0".to_string()),
1105                    _ => {}
1106                }
1107            }
1108        }
1109
1110        // Build capabilities
1111        let capabilities = self.build_capabilities(spec)?;
1112
1113        // Determine working directory: builder override > spec.command.workdir > image config > "/"
1114        let cwd = self
1115            .cwd
1116            .clone()
1117            .or_else(|| spec.command.workdir.clone())
1118            .or_else(|| {
1119                self.image_config
1120                    .as_ref()
1121                    .and_then(|c| c.working_dir.as_ref())
1122                    .filter(|w| !w.is_empty())
1123                    .cloned()
1124            })
1125            .unwrap_or_else(|| "/".to_string());
1126
1127        // Resolve process args: builder override > spec command > image config > /bin/sh
1128        let process_args = if let Some(ref args) = self.args {
1129            args.clone()
1130        } else {
1131            Self::resolve_command_from_spec(spec, self.image_config.as_ref())
1132        };
1133
1134        // Build process
1135        let mut process_builder = ProcessBuilder::default()
1136            .terminal(false)
1137            .user(user)
1138            .env(env)
1139            .args(process_args)
1140            .cwd(cwd)
1141            .no_new_privileges(!spec.privileged && spec.capabilities.is_empty());
1142
1143        // Set capabilities if we have them
1144        if let Some(caps) = capabilities {
1145            process_builder = process_builder.capabilities(caps);
1146        }
1147
1148        // Translate `spec.ulimits` (Docker --ulimit style, lowercase keys) into
1149        // OCI `process.rlimits`. Without this libcontainer never calls
1150        // setrlimit and the container inherits the launching daemon's
1151        // defaults — typically nofile=1024, which saturates sharded-storage
1152        // workloads (PlatformStore, etc.) within seconds of boot.
1153        let mut rlimits: Vec<PosixRlimit> = Vec::with_capacity(spec.ulimits.len());
1154        for (name, limit) in &spec.ulimits {
1155            let typ = ulimit_name_to_posix(name).ok_or_else(|| {
1156                AgentError::InvalidSpec(format!(
1157                    "unknown ulimit name `{name}` (expected one of: cpu, fsize, data, stack, \
1158                     core, rss, nproc, nofile, memlock, as, locks, sigpending, msgqueue, nice, \
1159                     rtprio, rttime)"
1160                ))
1161            })?;
1162            let entry = PosixRlimitBuilder::default()
1163                .typ(typ)
1164                .soft(u64::try_from(limit.soft.max(0)).unwrap_or(0))
1165                .hard(u64::try_from(limit.hard.max(0)).unwrap_or(0))
1166                .build()
1167                .map_err(|e| {
1168                    AgentError::InvalidSpec(format!("failed to build rlimit `{name}`: {e}"))
1169                })?;
1170            rlimits.push(entry);
1171        }
1172        if !rlimits.is_empty() {
1173            process_builder = process_builder.rlimits(rlimits);
1174        }
1175
1176        let process = process_builder
1177            .build()
1178            .map_err(|e| AgentError::InvalidSpec(format!("failed to build process: {e}")))?;
1179
1180        // Build root filesystem config
1181        // Note: "rootfs" is relative to the bundle directory per OCI spec
1182        let root = RootBuilder::default()
1183            .path("rootfs".to_string())
1184            .readonly(false)
1185            .build()
1186            .map_err(|e| AgentError::InvalidSpec(format!("failed to build root: {e}")))?;
1187
1188        // Build default mounts
1189        let mut mounts = self.build_default_mounts(spec)?;
1190
1191        // Add storage mounts from spec
1192        let storage_mounts = self.build_storage_mounts(spec, volume_paths)?;
1193        mounts.extend(storage_mounts);
1194
1195        // Add ZLayer API socket bind-mount if configured.
1196        // Use typ("bind") so libcontainer's mount code handles the source path
1197        // correctly for sockets (canonicalize + file-based mount point creation).
1198        if let Some(ref socket_path) = self.socket_path {
1199            mounts.push(
1200                MountBuilder::default()
1201                    .destination(zlayer_paths::ZLayerDirs::default_socket_path())
1202                    .typ("bind")
1203                    .source(socket_path.clone())
1204                    .options(vec!["rbind".into(), "ro".into()])
1205                    .build()
1206                    .expect("valid socket mount"),
1207            );
1208        }
1209
1210        // Container DNS resolver injection.
1211        //
1212        // youki/libcontainer does no resolv.conf handling on its own: the
1213        // container sees whatever `/etc/resolv.conf` the image shipped (often
1214        // empty/absent). When the spec carries explicit resolver addresses
1215        // (`spec.dns`, populated upstream in `ServiceManager` with the overlay
1216        // resolver's node-IP — the host's own resolv.conf is unusable because
1217        // the netbird `~.` systemd-resolved hijack swallows container queries),
1218        // we materialize a minimal resolv.conf alongside the bundle and
1219        // bind-mount it read-only at `/etc/resolv.conf`.
1220        //
1221        // The `resolv.conf` `nameserver` directive has no port syntax (always
1222        // port 53), which is exactly why the overlay DNS server must already be
1223        // bound on `<node_ip>:53` for this address to be useful.
1224        //
1225        // Host-network containers share the host's `/etc/resolv.conf` directly,
1226        // so we skip injection for them (matching the Docker runtime). On the
1227        // WSL2-on-Windows render path `build_spec_only` is called without an
1228        // on-disk bundle directory; the `bundle_dir.exists()` guard skips the
1229        // file write + mount there, preserving today's behavior.
1230        if !spec.host_network && !spec.dns.is_empty() && self.bundle_dir.exists() {
1231            let resolv_path = self.bundle_dir.join("resolv.conf");
1232            let contents = generate_resolv_conf(&spec.dns);
1233            fs::write(&resolv_path, contents).await.map_err(|e| {
1234                AgentError::InvalidSpec(format!(
1235                    "failed to write resolv.conf to bundle at {}: {e}",
1236                    resolv_path.display()
1237                ))
1238            })?;
1239            mounts.push(
1240                MountBuilder::default()
1241                    .destination("/etc/resolv.conf".to_string())
1242                    .typ("bind")
1243                    .source(resolv_path.to_string_lossy().to_string())
1244                    .options(vec!["rbind".to_string(), "ro".to_string()])
1245                    .build()
1246                    .map_err(|e| {
1247                        AgentError::InvalidSpec(format!("failed to build resolv.conf mount: {e}"))
1248                    })?,
1249            );
1250        }
1251
1252        // Append CDI-provided mounts (e.g. vendor driver libraries that the
1253        // GPU runtime needs to expose to the container).
1254        if let Some(ref edits_per_device) = cdi_edits {
1255            for edits in edits_per_device {
1256                for cdi_mount in &edits.mounts {
1257                    let mut opts = cdi_mount.options.clone();
1258                    if !opts.iter().any(|o| o == "bind" || o == "rbind") {
1259                        opts.push("rbind".to_string());
1260                    }
1261                    mounts.push(
1262                        MountBuilder::default()
1263                            .destination(cdi_mount.container_path.clone())
1264                            .typ("bind")
1265                            .source(cdi_mount.host_path.clone())
1266                            .options(opts)
1267                            .build()
1268                            .map_err(|e| {
1269                                AgentError::InvalidSpec(format!("failed to build CDI mount: {e}"))
1270                            })?,
1271                    );
1272                }
1273            }
1274        }
1275
1276        // GPU sharing mounts.
1277        //
1278        // MPS: bind-mount the host pipe / log directories into the container
1279        // at the same path so the in-container CUDA runtime can talk to the
1280        // MPS daemon over its UNIX socket and append to the shared log.
1281        // The env vars (`CUDA_MPS_PIPE_DIRECTORY` / `CUDA_MPS_LOG_DIRECTORY`)
1282        // are exported earlier in the env-assembly block.
1283        //
1284        // Time-slicing: optionally surface the host's slicing config YAML at
1285        // a well-known read-only path so introspection tools inside the
1286        // container can read it.
1287        if let Some(ref dirs) = mps_dirs {
1288            mounts.push(
1289                MountBuilder::default()
1290                    .destination(dirs.pipe_dir.clone())
1291                    .typ("bind")
1292                    .source(dirs.pipe_dir.clone())
1293                    .options(vec!["rbind".into(), "rw".into()])
1294                    .build()
1295                    .map_err(|e| {
1296                        AgentError::InvalidSpec(format!("failed to build MPS pipe mount: {e}"))
1297                    })?,
1298            );
1299            mounts.push(
1300                MountBuilder::default()
1301                    .destination(dirs.log_dir.clone())
1302                    .typ("bind")
1303                    .source(dirs.log_dir.clone())
1304                    .options(vec!["rbind".into(), "rw".into()])
1305                    .build()
1306                    .map_err(|e| {
1307                        AgentError::InvalidSpec(format!("failed to build MPS log mount: {e}"))
1308                    })?,
1309            );
1310        }
1311        if let Some(ref gpu) = spec.resources.gpu {
1312            if gpu.sharing == Some(GpuSharingMode::TimeSlice) {
1313                if let Some(ref cfg_path) = gpu.time_slicing_config_path {
1314                    let host = PathBuf::from(cfg_path);
1315                    if !host.is_file() {
1316                        return Err(AgentError::GpuSharingUnavailable {
1317                            mode: "time-slice".to_string(),
1318                            reason: format!(
1319                                "time-slicing config {} is not a regular file on the host",
1320                                host.display()
1321                            ),
1322                        });
1323                    }
1324                    mounts.push(
1325                        MountBuilder::default()
1326                            .destination(PathBuf::from(TIMESLICE_CONFIG_CONTAINER_PATH))
1327                            .typ("bind")
1328                            .source(host)
1329                            .options(vec!["rbind".into(), "ro".into()])
1330                            .build()
1331                            .map_err(|e| {
1332                                AgentError::InvalidSpec(format!(
1333                                    "failed to build time-slicing config mount: {e}"
1334                                ))
1335                            })?,
1336                    );
1337                }
1338            }
1339        }
1340
1341        // Build Linux-specific config
1342        let linux = self.build_linux_config(container_id, spec, cdi_edits.as_deref())?;
1343
1344        // Determine hostname
1345        let hostname = self
1346            .hostname
1347            .clone()
1348            .unwrap_or_else(|| container_id.to_string());
1349
1350        // Build the complete spec, attaching any CDI-provided hooks.
1351        let mut spec_builder = SpecBuilder::default()
1352            .version("1.0.2".to_string())
1353            .root(root)
1354            .process(process)
1355            .hostname(hostname)
1356            .mounts(mounts)
1357            .linux(linux);
1358
1359        if let Some(ref edits_per_device) = cdi_edits {
1360            if let Some(hooks) = Self::build_hooks_from_cdi(edits_per_device)? {
1361                spec_builder = spec_builder.hooks(hooks);
1362            }
1363        }
1364
1365        let oci_spec = spec_builder
1366            .build()
1367            .map_err(|e| AgentError::InvalidSpec(format!("failed to build OCI spec: {e}")))?;
1368
1369        Ok(oci_spec)
1370    }
1371
1372    /// Convert the union of CDI hooks across all resolved devices into an
1373    /// OCI [`Hooks`] block.
1374    ///
1375    /// Returns `Ok(None)` when no device contributed hooks (so the spec
1376    /// builder skips the empty block — `oci-spec` treats `null` as "no
1377    /// hooks" while serializers may emit empty arrays otherwise).
1378    fn build_hooks_from_cdi(edits_per_device: &[CdiContainerEdits]) -> Result<Option<Hooks>> {
1379        let mut prestart: Vec<Hook> = Vec::new();
1380        let mut create_runtime: Vec<Hook> = Vec::new();
1381        let mut create_container: Vec<Hook> = Vec::new();
1382        let mut start_container: Vec<Hook> = Vec::new();
1383        let mut poststart: Vec<Hook> = Vec::new();
1384        let mut poststop: Vec<Hook> = Vec::new();
1385
1386        for edits in edits_per_device {
1387            let Some(ref h) = edits.hooks else { continue };
1388            for hook in &h.prestart {
1389                prestart.push(convert_cdi_hook(hook)?);
1390            }
1391            for hook in &h.create_runtime {
1392                create_runtime.push(convert_cdi_hook(hook)?);
1393            }
1394            for hook in &h.create_container {
1395                create_container.push(convert_cdi_hook(hook)?);
1396            }
1397            for hook in &h.start_container {
1398                start_container.push(convert_cdi_hook(hook)?);
1399            }
1400            for hook in &h.poststart {
1401                poststart.push(convert_cdi_hook(hook)?);
1402            }
1403            for hook in &h.poststop {
1404                poststop.push(convert_cdi_hook(hook)?);
1405            }
1406        }
1407
1408        if prestart.is_empty()
1409            && create_runtime.is_empty()
1410            && create_container.is_empty()
1411            && start_container.is_empty()
1412            && poststart.is_empty()
1413            && poststop.is_empty()
1414        {
1415            return Ok(None);
1416        }
1417
1418        let mut builder = HooksBuilder::default();
1419        if !prestart.is_empty() {
1420            #[allow(deprecated)]
1421            {
1422                builder = builder.prestart(prestart);
1423            }
1424        }
1425        if !create_runtime.is_empty() {
1426            builder = builder.create_runtime(create_runtime);
1427        }
1428        if !create_container.is_empty() {
1429            builder = builder.create_container(create_container);
1430        }
1431        if !start_container.is_empty() {
1432            builder = builder.start_container(start_container);
1433        }
1434        if !poststart.is_empty() {
1435            builder = builder.poststart(poststart);
1436        }
1437        if !poststop.is_empty() {
1438            builder = builder.poststop(poststop);
1439        }
1440
1441        let hooks = builder
1442            .build()
1443            .map_err(|e| AgentError::InvalidSpec(format!("failed to build CDI hooks: {e}")))?;
1444        Ok(Some(hooks))
1445    }
1446
1447    /// Build Linux capabilities configuration
1448    #[allow(clippy::unused_self)]
1449    fn build_capabilities(
1450        &self,
1451        spec: &ServiceSpec,
1452    ) -> Result<Option<oci_spec::runtime::LinuxCapabilities>> {
1453        if spec.privileged {
1454            // Privileged mode: all capabilities
1455            let all_caps: HashSet<Capability> = ALL_CAPABILITIES.iter().copied().collect();
1456            let empty_caps: HashSet<Capability> = HashSet::new();
1457
1458            let caps = LinuxCapabilitiesBuilder::default()
1459                .bounding(all_caps.clone())
1460                .effective(all_caps.clone())
1461                .permitted(all_caps)
1462                .inheritable(empty_caps.clone())
1463                .ambient(empty_caps)
1464                .build()
1465                .map_err(|e| {
1466                    AgentError::InvalidSpec(format!("failed to build capabilities: {e}"))
1467                })?;
1468
1469            Ok(Some(caps))
1470        } else if !spec.capabilities.is_empty() {
1471            // Specific capabilities requested
1472            let caps: HashSet<Capability> = spec
1473                .capabilities
1474                .iter()
1475                .filter_map(|c| {
1476                    // Normalize capability name (add CAP_ prefix if missing, uppercase)
1477                    let cap_name = if c.starts_with("CAP_") {
1478                        c.to_uppercase()
1479                    } else {
1480                        format!("CAP_{}", c.to_uppercase())
1481                    };
1482                    Capability::from_str(&cap_name).ok()
1483                })
1484                .collect();
1485
1486            let empty_caps: HashSet<Capability> = HashSet::new();
1487
1488            let built_caps = LinuxCapabilitiesBuilder::default()
1489                .bounding(caps.clone())
1490                .effective(caps.clone())
1491                .permitted(caps)
1492                .inheritable(empty_caps.clone())
1493                .ambient(empty_caps)
1494                .build()
1495                .map_err(|e| {
1496                    AgentError::InvalidSpec(format!("failed to build capabilities: {e}"))
1497                })?;
1498
1499            Ok(Some(built_caps))
1500        } else {
1501            // Default: minimal capabilities for basic container operation
1502            let default_caps: HashSet<Capability> = [
1503                Capability::Chown,
1504                Capability::DacOverride,
1505                Capability::Fsetid,
1506                Capability::Fowner,
1507                Capability::Mknod,
1508                Capability::NetRaw,
1509                Capability::Setgid,
1510                Capability::Setuid,
1511                Capability::Setfcap,
1512                Capability::Setpcap,
1513                Capability::NetBindService,
1514                Capability::SysChroot,
1515                Capability::Kill,
1516                Capability::AuditWrite,
1517            ]
1518            .into_iter()
1519            .collect();
1520
1521            let empty_caps: HashSet<Capability> = HashSet::new();
1522
1523            let built_caps = LinuxCapabilitiesBuilder::default()
1524                .bounding(default_caps.clone())
1525                .effective(default_caps.clone())
1526                .permitted(default_caps)
1527                .inheritable(empty_caps.clone())
1528                .ambient(empty_caps)
1529                .build()
1530                .map_err(|e| {
1531                    AgentError::InvalidSpec(format!("failed to build capabilities: {e}"))
1532                })?;
1533
1534            Ok(Some(built_caps))
1535        }
1536    }
1537
1538    /// Build default filesystem mounts for the container
1539    #[allow(clippy::unused_self, clippy::too_many_lines)]
1540    fn build_default_mounts(&self, spec: &ServiceSpec) -> Result<Vec<Mount>> {
1541        let mut mounts = Vec::new();
1542
1543        // /proc
1544        mounts.push(
1545            MountBuilder::default()
1546                .destination("/proc".to_string())
1547                .typ("proc".to_string())
1548                .source("proc".to_string())
1549                .options(vec![
1550                    "nosuid".to_string(),
1551                    "noexec".to_string(),
1552                    "nodev".to_string(),
1553                ])
1554                .build()
1555                .map_err(|e| {
1556                    AgentError::InvalidSpec(format!("failed to build /proc mount: {e}"))
1557                })?,
1558        );
1559
1560        // /dev
1561        mounts.push(
1562            MountBuilder::default()
1563                .destination("/dev".to_string())
1564                .typ("tmpfs".to_string())
1565                .source("tmpfs".to_string())
1566                .options(vec![
1567                    "nosuid".to_string(),
1568                    "strictatime".to_string(),
1569                    "mode=755".to_string(),
1570                    "size=65536k".to_string(),
1571                ])
1572                .build()
1573                .map_err(|e| AgentError::InvalidSpec(format!("failed to build /dev mount: {e}")))?,
1574        );
1575
1576        // /dev/pts
1577        mounts.push(
1578            MountBuilder::default()
1579                .destination("/dev/pts".to_string())
1580                .typ("devpts".to_string())
1581                .source("devpts".to_string())
1582                .options(vec![
1583                    "nosuid".to_string(),
1584                    "noexec".to_string(),
1585                    "newinstance".to_string(),
1586                    "ptmxmode=0666".to_string(),
1587                    "mode=0620".to_string(),
1588                    "gid=5".to_string(),
1589                ])
1590                .build()
1591                .map_err(|e| {
1592                    AgentError::InvalidSpec(format!("failed to build /dev/pts mount: {e}"))
1593                })?,
1594        );
1595
1596        // /dev/shm
1597        mounts.push(
1598            MountBuilder::default()
1599                .destination("/dev/shm".to_string())
1600                .typ("tmpfs".to_string())
1601                .source("shm".to_string())
1602                .options(vec![
1603                    "nosuid".to_string(),
1604                    "noexec".to_string(),
1605                    "nodev".to_string(),
1606                    "mode=1777".to_string(),
1607                    "size=65536k".to_string(),
1608                ])
1609                .build()
1610                .map_err(|e| {
1611                    AgentError::InvalidSpec(format!("failed to build /dev/shm mount: {e}"))
1612                })?,
1613        );
1614
1615        // /dev/mqueue
1616        mounts.push(
1617            MountBuilder::default()
1618                .destination("/dev/mqueue".to_string())
1619                .typ("mqueue".to_string())
1620                .source("mqueue".to_string())
1621                .options(vec![
1622                    "nosuid".to_string(),
1623                    "noexec".to_string(),
1624                    "nodev".to_string(),
1625                ])
1626                .build()
1627                .map_err(|e| {
1628                    AgentError::InvalidSpec(format!("failed to build /dev/mqueue mount: {e}"))
1629                })?,
1630        );
1631
1632        // /sys - read-only unless privileged
1633        let sys_options = if spec.privileged {
1634            vec![
1635                "nosuid".to_string(),
1636                "noexec".to_string(),
1637                "nodev".to_string(),
1638            ]
1639        } else {
1640            vec![
1641                "nosuid".to_string(),
1642                "noexec".to_string(),
1643                "nodev".to_string(),
1644                "ro".to_string(),
1645            ]
1646        };
1647
1648        mounts.push(
1649            MountBuilder::default()
1650                .destination("/sys".to_string())
1651                .typ("sysfs".to_string())
1652                .source("sysfs".to_string())
1653                .options(sys_options)
1654                .build()
1655                .map_err(|e| AgentError::InvalidSpec(format!("failed to build /sys mount: {e}")))?,
1656        );
1657
1658        // /sys/fs/cgroup - for cgroup access
1659        mounts.push(
1660            MountBuilder::default()
1661                .destination("/sys/fs/cgroup".to_string())
1662                .typ("cgroup2".to_string())
1663                .source("cgroup".to_string())
1664                .options(vec![
1665                    "nosuid".to_string(),
1666                    "noexec".to_string(),
1667                    "nodev".to_string(),
1668                    "relatime".to_string(),
1669                ])
1670                .build()
1671                .map_err(|e| {
1672                    AgentError::InvalidSpec(format!("failed to build cgroup mount: {e}"))
1673                })?,
1674        );
1675
1676        Ok(mounts)
1677    }
1678
1679    /// Build storage mounts from `ServiceSpec` storage entries
1680    ///
1681    /// Converts `StorageSpec` entries to OCI Mount entries.
1682    /// Note: Named and Anonymous volumes require `StorageManager` to prepare paths.
1683    /// S3 volumes require s3fs FUSE mount (handled separately).
1684    #[allow(clippy::unused_self, clippy::too_many_lines)]
1685    fn build_storage_mounts(
1686        &self,
1687        spec: &ServiceSpec,
1688        volume_paths: &std::collections::HashMap<String, PathBuf>,
1689    ) -> Result<Vec<Mount>> {
1690        let mut mounts = Vec::new();
1691
1692        for storage in &spec.storage {
1693            let mount = match storage {
1694                StorageSpec::Bind {
1695                    source,
1696                    target,
1697                    readonly,
1698                } => {
1699                    let mut options = vec!["rbind".to_string()];
1700                    if *readonly {
1701                        options.push("ro".to_string());
1702                    } else {
1703                        options.push("rw".to_string());
1704                    }
1705
1706                    MountBuilder::default()
1707                        .destination(target.clone())
1708                        .typ("none".to_string())
1709                        .source(source.clone())
1710                        .options(options)
1711                        .build()
1712                        .map_err(|e| {
1713                            AgentError::InvalidSpec(format!(
1714                                "failed to build bind mount for {target}: {e}"
1715                            ))
1716                        })?
1717                }
1718
1719                StorageSpec::Named {
1720                    name,
1721                    target,
1722                    readonly,
1723                    tier,
1724                    ..
1725                } => {
1726                    // Get the prepared volume path from StorageManager
1727                    let source = volume_paths.get(name).ok_or_else(|| {
1728                        AgentError::InvalidSpec(format!(
1729                            "volume '{name}' not prepared - ensure StorageManager.ensure_volume() was called"
1730                        ))
1731                    })?;
1732
1733                    // Warn about SQLite safety for non-local tiers
1734                    if matches!(tier, StorageTier::Network) {
1735                        tracing::warn!(
1736                            volume = %name,
1737                            tier = ?tier,
1738                            "Network storage tier is NOT SQLite-safe. Avoid using SQLite databases on this volume."
1739                        );
1740                    }
1741
1742                    let mut options = vec!["rbind".to_string()];
1743                    if *readonly {
1744                        options.push("ro".to_string());
1745                    } else {
1746                        options.push("rw".to_string());
1747                    }
1748
1749                    MountBuilder::default()
1750                        .destination(target.clone())
1751                        .typ("none".to_string())
1752                        .source(source.to_string_lossy().to_string())
1753                        .options(options)
1754                        .build()
1755                        .map_err(|e| {
1756                            AgentError::InvalidSpec(format!(
1757                                "failed to build named volume mount for {target}: {e}"
1758                            ))
1759                        })?
1760                }
1761
1762                StorageSpec::Anonymous { target, tier } => {
1763                    // Anonymous volumes should have been created by StorageManager
1764                    // and the path passed in volume_paths with key "_anon_{target}"
1765                    let key = format!("_anon_{}", target.trim_start_matches('/').replace('/', "_"));
1766                    let source = volume_paths.get(&key).ok_or_else(|| {
1767                        AgentError::InvalidSpec(format!(
1768                            "anonymous volume for '{target}' not prepared"
1769                        ))
1770                    })?;
1771
1772                    if matches!(tier, StorageTier::Network) {
1773                        tracing::warn!(
1774                            target = %target,
1775                            tier = ?tier,
1776                            "Network storage tier is NOT SQLite-safe."
1777                        );
1778                    }
1779
1780                    let options = vec!["rbind".to_string(), "rw".to_string()];
1781
1782                    MountBuilder::default()
1783                        .destination(target.clone())
1784                        .typ("none".to_string())
1785                        .source(source.to_string_lossy().to_string())
1786                        .options(options)
1787                        .build()
1788                        .map_err(|e| {
1789                            AgentError::InvalidSpec(format!(
1790                                "failed to build anonymous volume mount for {target}: {e}"
1791                            ))
1792                        })?
1793                }
1794
1795                StorageSpec::Tmpfs { target, size, mode } => {
1796                    let mut options = vec!["nosuid".to_string(), "nodev".to_string()];
1797
1798                    if let Some(size_str) = size {
1799                        options.push(format!("size={size_str}"));
1800                    }
1801
1802                    if let Some(mode_val) = mode {
1803                        options.push(format!("mode={mode_val:o}"));
1804                    }
1805
1806                    MountBuilder::default()
1807                        .destination(target.clone())
1808                        .typ("tmpfs".to_string())
1809                        .source("tmpfs".to_string())
1810                        .options(options)
1811                        .build()
1812                        .map_err(|e| {
1813                            AgentError::InvalidSpec(format!(
1814                                "failed to build tmpfs mount for {target}: {e}"
1815                            ))
1816                        })?
1817                }
1818
1819                StorageSpec::S3 {
1820                    bucket,
1821                    prefix,
1822                    target,
1823                    readonly,
1824                    endpoint: _,
1825                    credentials: _,
1826                } => {
1827                    // S3 mounts are handled via s3fs FUSE
1828                    // The StorageManager should have mounted the bucket and passed the path
1829                    let key = format!("_s3_{}_{}", bucket, prefix.as_deref().unwrap_or(""));
1830                    let source = volume_paths.get(&key).ok_or_else(|| {
1831                        AgentError::InvalidSpec(format!(
1832                            "S3 volume for bucket '{bucket}' not mounted - ensure StorageManager.mount_s3() was called"
1833                        ))
1834                    })?;
1835
1836                    tracing::warn!(
1837                        bucket = %bucket,
1838                        target = %target,
1839                        "S3 storage is NOT SQLite-safe. Use for read-heavy workloads only."
1840                    );
1841
1842                    let mut options = vec!["rbind".to_string()];
1843                    if *readonly {
1844                        options.push("ro".to_string());
1845                    } else {
1846                        options.push("rw".to_string());
1847                    }
1848
1849                    MountBuilder::default()
1850                        .destination(target.clone())
1851                        .typ("none".to_string())
1852                        .source(source.to_string_lossy().to_string())
1853                        .options(options)
1854                        .build()
1855                        .map_err(|e| {
1856                            AgentError::InvalidSpec(format!(
1857                                "failed to build S3 mount for {target}: {e}"
1858                            ))
1859                        })?
1860                }
1861            };
1862
1863            mounts.push(mount);
1864        }
1865
1866        Ok(mounts)
1867    }
1868
1869    /// Build Linux-specific configuration
1870    #[allow(clippy::similar_names)] // euid/egid are POSIX-standard paired names
1871    #[allow(clippy::too_many_lines)]
1872    fn build_linux_config(
1873        &self,
1874        container_id: &ContainerId,
1875        spec: &ServiceSpec,
1876        cdi_edits: Option<&[CdiContainerEdits]>,
1877    ) -> Result<oci_spec::runtime::Linux> {
1878        // Build namespaces
1879        let mut namespaces = vec![
1880            LinuxNamespaceBuilder::default()
1881                .typ(LinuxNamespaceType::Pid)
1882                .build()
1883                .unwrap(),
1884            LinuxNamespaceBuilder::default()
1885                .typ(LinuxNamespaceType::Ipc)
1886                .build()
1887                .unwrap(),
1888            LinuxNamespaceBuilder::default()
1889                .typ(LinuxNamespaceType::Uts)
1890                .build()
1891                .unwrap(),
1892            LinuxNamespaceBuilder::default()
1893                .typ(LinuxNamespaceType::Mount)
1894                .build()
1895                .unwrap(),
1896        ];
1897
1898        // Only add Network namespace when NOT using host networking.
1899        // In host networking mode, the container shares the host's network stack
1900        // (like Docker's --network host).
1901        if !self.host_network {
1902            namespaces.push(
1903                LinuxNamespaceBuilder::default()
1904                    .typ(LinuxNamespaceType::Network)
1905                    .build()
1906                    .unwrap(),
1907            );
1908        }
1909
1910        // `nix::unistd` is unix-only. On non-unix targets (Windows), libcontainer
1911        // is not the runtime path (HCS is) and this function is effectively dead
1912        // code — so we statically force `rootless = false` there and skip the
1913        // user-namespace mapping block entirely.
1914        #[cfg(unix)]
1915        let rootless = !nix::unistd::geteuid().is_root();
1916        #[cfg(not(unix))]
1917        let rootless = false;
1918
1919        if rootless {
1920            namespaces.push(
1921                LinuxNamespaceBuilder::default()
1922                    .typ(LinuxNamespaceType::User)
1923                    .build()
1924                    .unwrap(),
1925            );
1926            namespaces.push(
1927                LinuxNamespaceBuilder::default()
1928                    .typ(LinuxNamespaceType::Cgroup)
1929                    .build()
1930                    .unwrap(),
1931            );
1932        }
1933
1934        let mut linux_builder = LinuxBuilder::default().namespaces(namespaces);
1935
1936        #[cfg(unix)]
1937        if rootless {
1938            let euid = nix::unistd::geteuid();
1939            let egid = nix::unistd::getegid();
1940            let username = nix::unistd::User::from_uid(euid)
1941                .ok()
1942                .flatten()
1943                .map(|u| u.name)
1944                .unwrap_or_default();
1945            linux_builder = linux_builder
1946                .uid_mappings(build_rootless_id_mappings(
1947                    euid.as_raw(),
1948                    "/etc/subuid",
1949                    &username,
1950                ))
1951                .gid_mappings(build_rootless_id_mappings(
1952                    egid.as_raw(),
1953                    "/etc/subgid",
1954                    &username,
1955                ));
1956        }
1957
1958        // Build resources (CPU, memory, devices)
1959        let resources = self.build_resources(spec)?;
1960        if let Some(resources) = resources {
1961            linux_builder = linux_builder.resources(resources);
1962        }
1963
1964        // Build device entries for passthrough.
1965        //
1966        // When CDI edits are present, the vendor-supplied device-node list
1967        // replaces our baked-in vendor-specific defaults — CDI knows the
1968        // host's exact device geometry (which majors/minors map to which
1969        // GPUs) so we trust it over our static `/dev/nvidiaN` enumeration.
1970        let mut devices = self.build_devices(spec, None, cdi_edits.is_some())?;
1971        if let Some(edits_per_device) = cdi_edits {
1972            for edits in edits_per_device {
1973                for node in &edits.device_nodes {
1974                    devices.push(cdi_node_to_oci_device(node)?);
1975                }
1976            }
1977        }
1978        if !devices.is_empty() {
1979            linux_builder = linux_builder.devices(devices);
1980        }
1981
1982        // Set rootfs propagation (matches Docker default)
1983        linux_builder = linux_builder.rootfs_propagation("private".to_string());
1984
1985        // Set masked/readonly paths based on privileged mode
1986        if spec.privileged {
1987            // Privileged containers get no masked paths (full access)
1988            linux_builder = linux_builder.masked_paths(vec![]).readonly_paths(vec![]);
1989        } else {
1990            // Set masked paths for security (hide sensitive host info)
1991            let masked_paths = vec![
1992                "/proc/acpi".to_string(),
1993                "/proc/asound".to_string(),
1994                "/proc/kcore".to_string(),
1995                "/proc/keys".to_string(),
1996                "/proc/latency_stats".to_string(),
1997                "/proc/timer_list".to_string(),
1998                "/proc/timer_stats".to_string(),
1999                "/proc/sched_debug".to_string(),
2000                "/proc/scsi".to_string(),
2001                "/sys/firmware".to_string(),
2002            ];
2003
2004            // Set readonly paths for security
2005            let readonly_paths = vec![
2006                "/proc/bus".to_string(),
2007                "/proc/fs".to_string(),
2008                "/proc/irq".to_string(),
2009                "/proc/sys".to_string(),
2010                "/proc/sysrq-trigger".to_string(),
2011            ];
2012
2013            linux_builder = linux_builder
2014                .masked_paths(masked_paths)
2015                .readonly_paths(readonly_paths);
2016        }
2017
2018        // Determine cgroups_path so libcontainer creates the container cgroup
2019        // under the current process's cgroup rather than at the v2 root. This
2020        // is required when running inside another container (e.g. Forgejo CI
2021        // `container:` block) where `/sys/fs/cgroup/cgroup.subtree_control` is
2022        // read-only. Precedence:
2023        //   1. spec.cgroup_parent (per-service override)         — all platforms
2024        //   2. ZLAYER_CGROUP_PARENT env var (host-wide override) — all platforms
2025        //   3. /proc/self/cgroup (auto-detect when nested)       — Linux only
2026        //   4. unset (default — bare-metal happy path; also the WSL2-delegate
2027        //      case on non-Linux hosts, where libcontainer inside the WSL
2028        //      distro resolves the parent at `zlayer runtime create` time)
2029        let cid = container_id.to_string();
2030
2031        // Explicit overrides are honored on every platform: a user might pin a
2032        // cgroup_parent for a WSL-delegate-bound spec even when this process
2033        // is running on Windows.
2034        let explicit_parent: Option<(String, &'static str)> =
2035            if let Some(p) = spec.cgroup_parent.as_deref().filter(|s| !s.is_empty()) {
2036                Some((p.to_string(), "spec"))
2037            } else if let Some(p) = std::env::var("ZLAYER_CGROUP_PARENT")
2038                .ok()
2039                .filter(|s| !s.is_empty())
2040            {
2041                Some((p, "env"))
2042            } else {
2043                None
2044            };
2045
2046        // Auto-detect (and the "no writable parent" hard error below) are
2047        // Linux-only: they inspect /proc/self/cgroup and /sys/fs/cgroup, which
2048        // don't exist on Windows hosts. When the bundle is destined for the
2049        // WSL2 delegate, cgroup-parent resolution happens inside the distro
2050        // at `zlayer runtime create` time, not here on the host.
2051        #[cfg(target_os = "linux")]
2052        let auto_parent: Option<(String, &'static str)> =
2053            if let Some(p) = crate::capability::ensure_daemon_leaf_and_container_parent() {
2054                Some((p, "auto-init"))
2055            } else if let Some(p) = crate::capability::current_cgroup_v2_path() {
2056                // Fallback: migration failed (likely cgroup root is read-only); use the
2057                // raw scope path. Pre-fix behaviour — surfaces the original error.
2058                Some((p, "auto"))
2059            } else {
2060                None
2061            };
2062        #[cfg(not(target_os = "linux"))]
2063        let auto_parent: Option<(String, &'static str)> = None;
2064
2065        let (cgroup_parent_value, cgroup_parent_source): (Option<String>, &'static str) =
2066            explicit_parent
2067                .or(auto_parent)
2068                .map_or((None, "none"), |(p, s)| (Some(p), s));
2069
2070        // Diagnostic guard rail: capability survey says we're nested, but we
2071        // couldn't resolve a cgroup parent here. This combination should not
2072        // normally happen because both code paths consult the same
2073        // `current_cgroup_v2_path()` helper. Surface it so an operator can
2074        // investigate; do not fail container creation. Linux-only — the
2075        // capability survey is itself a no-op on non-Linux.
2076        #[cfg(target_os = "linux")]
2077        if cgroup_parent_value.is_none() && crate::capability::DaemonCapabilities::get().is_nested {
2078            tracing::warn!(
2079                container_id = %cid,
2080                "capability survey reports nested daemon but cgroup_parent could not be resolved — proceeding with v2 root"
2081            );
2082        }
2083
2084        if let Some(parent) = cgroup_parent_value {
2085            let parent = parent.trim_end_matches('/');
2086            let full = format!("{parent}/{cid}");
2087            match cgroup_parent_source {
2088                "spec" => tracing::info!(
2089                    container_id = %cid,
2090                    source = "spec",
2091                    path = %full,
2092                    "cgroup_parent selected"
2093                ),
2094                "env" => tracing::info!(
2095                    container_id = %cid,
2096                    source = "env",
2097                    path = %full,
2098                    "cgroup_parent selected"
2099                ),
2100                "auto" => tracing::info!(
2101                    container_id = %cid,
2102                    source = "auto",
2103                    path = %full,
2104                    "cgroup_parent selected (from /proc/self/cgroup)"
2105                ),
2106                "auto-init" => tracing::info!(
2107                    container_id = %cid,
2108                    source = "auto-init",
2109                    path = %full,
2110                    "cgroup_parent selected (migrated daemon to <scope>/init; containers go under <scope>/containers)"
2111                ),
2112                _ => unreachable!(),
2113            }
2114            linux_builder = linux_builder.cgroups_path(std::path::PathBuf::from(full));
2115        } else {
2116            // Auto-detect found nothing AND no explicit override. Behaviour
2117            // differs by platform:
2118            //   - Linux: this is a real error in nested-container envs where
2119            //     the cgroup root is read-only. Emit the hard error so an
2120            //     operator fixes the env.
2121            //   - Non-Linux (Windows host building a bundle for the WSL2
2122            //     delegate): expected path; cgroup setup happens inside the
2123            //     distro at runtime-create time.
2124            #[cfg(target_os = "linux")]
2125            {
2126                let caps = crate::capability::DaemonCapabilities::get();
2127                if !caps.can_write_cgroup_root {
2128                    return Err(AgentError::InvalidSpec(format!(
2129                        "cannot create container {cid}: no writable cgroup parent. \
2130                         /proc/self/cgroup reports the cgroup-v2 root, and \
2131                         /sys/fs/cgroup is read-only to this process. Fix one of: \
2132                         (a) run the daemon's outer container with --cgroupns=host \
2133                         so /proc/self/cgroup reports a real parent; \
2134                         (b) set ZLAYER_CGROUP_PARENT=/path/to/writable/cgroup; \
2135                         (c) grant the daemon write access to /sys/fs/cgroup."
2136                    )));
2137                }
2138                tracing::info!(
2139                    container_id = %cid,
2140                    "cgroup_parent unset — libcontainer will use v2 root (cgroup root is writable here)"
2141                );
2142            }
2143            #[cfg(not(target_os = "linux"))]
2144            tracing::debug!(
2145                container_id = %cid,
2146                "non-Linux host — cgroup_parent unset; libcontainer inside the WSL distro will resolve a parent from its cgroup-v2 root"
2147            );
2148        }
2149
2150        linux_builder
2151            .build()
2152            .map_err(|e| AgentError::InvalidSpec(format!("failed to build linux config: {e}")))
2153    }
2154
2155    /// Build resource limits (CPU, memory, device cgroups)
2156    #[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
2157    fn build_resources(
2158        &self,
2159        spec: &ServiceSpec,
2160    ) -> Result<Option<oci_spec::runtime::LinuxResources>> {
2161        let mut resources_builder = LinuxResourcesBuilder::default();
2162        let mut has_resources = false;
2163
2164        // CPU limits
2165        if let Some(cpu_limit) = spec.resources.cpu {
2166            // Convert CPU cores to microseconds quota
2167            // 100000 microseconds = 1 core's worth of time per period
2168            let quota = (cpu_limit * 100_000.0) as i64;
2169            let cpu = LinuxCpuBuilder::default()
2170                .quota(quota)
2171                .period(100_000u64)
2172                .build()
2173                .map_err(|e| AgentError::InvalidSpec(format!("failed to build CPU limits: {e}")))?;
2174
2175            resources_builder = resources_builder.cpu(cpu);
2176            has_resources = true;
2177        }
2178
2179        // Memory limits
2180        if let Some(ref memory_str) = spec.resources.memory {
2181            let bytes = parse_memory_string(memory_str)
2182                .map_err(|e| AgentError::InvalidSpec(format!("invalid memory limit: {e}")))?;
2183
2184            let memory = LinuxMemoryBuilder::default()
2185                .limit(bytes as i64)
2186                .build()
2187                .map_err(|e| {
2188                    AgentError::InvalidSpec(format!("failed to build memory limits: {e}"))
2189                })?;
2190
2191            resources_builder = resources_builder.memory(memory);
2192            has_resources = true;
2193        }
2194
2195        // Device cgroup rules
2196        let device_rules = self.build_device_cgroup_rules(spec, None)?;
2197        if !device_rules.is_empty() {
2198            resources_builder = resources_builder.devices(device_rules);
2199            has_resources = true;
2200        }
2201
2202        if has_resources {
2203            let resources = resources_builder
2204                .build()
2205                .map_err(|e| AgentError::InvalidSpec(format!("failed to build resources: {e}")))?;
2206            Ok(Some(resources))
2207        } else {
2208            Ok(None)
2209        }
2210    }
2211
2212    /// Build device cgroup rules
2213    #[allow(clippy::unused_self, clippy::too_many_lines)]
2214    fn build_device_cgroup_rules(
2215        &self,
2216        spec: &ServiceSpec,
2217        _gpu_indices: Option<&[u32]>,
2218    ) -> Result<Vec<oci_spec::runtime::LinuxDeviceCgroup>> {
2219        let mut rules = Vec::new();
2220
2221        if spec.privileged {
2222            // Privileged mode: allow all devices
2223            let rule = LinuxDeviceCgroupBuilder::default()
2224                .allow(true)
2225                .access("rwm".to_string())
2226                .build()
2227                .map_err(|e| {
2228                    AgentError::InvalidSpec(format!("failed to build device cgroup rule: {e}"))
2229                })?;
2230            rules.push(rule);
2231        } else {
2232            // Default: deny all, then allow specific devices
2233            let deny_all = LinuxDeviceCgroupBuilder::default()
2234                .allow(false)
2235                .access("rwm".to_string())
2236                .build()
2237                .map_err(|e| AgentError::InvalidSpec(format!("failed to build deny rule: {e}")))?;
2238            rules.push(deny_all);
2239
2240            // Allow standard container devices
2241            // /dev/null, /dev/zero, /dev/full, /dev/random, /dev/urandom, /dev/tty
2242            let standard_char_devices = [
2243                (1, 3, "rwm"),    // /dev/null
2244                (1, 5, "rwm"),    // /dev/zero
2245                (1, 7, "rwm"),    // /dev/full
2246                (1, 8, "rwm"),    // /dev/random
2247                (1, 9, "rwm"),    // /dev/urandom
2248                (5, 0, "rwm"),    // /dev/tty
2249                (5, 1, "rwm"),    // /dev/console
2250                (5, 2, "rwm"),    // /dev/ptmx
2251                (136, -1, "rwm"), // /dev/pts/* (wildcard minor)
2252            ];
2253
2254            for (major, minor, access) in standard_char_devices {
2255                let mut builder = LinuxDeviceCgroupBuilder::default()
2256                    .allow(true)
2257                    .typ(LinuxDeviceType::C)
2258                    .major(i64::from(major))
2259                    .access(access.to_string());
2260
2261                if minor >= 0 {
2262                    builder = builder.minor(i64::from(minor));
2263                }
2264
2265                let rule = builder.build().map_err(|e| {
2266                    AgentError::InvalidSpec(format!("failed to build char device rule: {e}"))
2267                })?;
2268                rules.push(rule);
2269            }
2270
2271            // Allow specific devices from spec (Unix-only: requires /dev/* fs
2272            // probing via `MetadataExt::rdev`). On Windows the WSL2 delegate
2273            // path regenerates these inside the Linux distro, so we skip here.
2274            #[cfg(unix)]
2275            for device in &spec.devices {
2276                if let Ok((major, minor)) = get_device_major_minor(&device.path) {
2277                    let dev_type = get_device_type(&device.path).unwrap_or(LinuxDeviceType::C);
2278
2279                    // Build access string
2280                    let mut access = String::new();
2281                    if device.read {
2282                        access.push('r');
2283                    }
2284                    if device.write {
2285                        access.push('w');
2286                    }
2287                    if device.mknod {
2288                        access.push('m');
2289                    }
2290                    if access.is_empty() {
2291                        access = "rw".to_string();
2292                    }
2293
2294                    let rule = LinuxDeviceCgroupBuilder::default()
2295                        .allow(true)
2296                        .typ(dev_type)
2297                        .major(major)
2298                        .minor(minor)
2299                        .access(access)
2300                        .build()
2301                        .map_err(|e| {
2302                            AgentError::InvalidSpec(format!(
2303                                "failed to build device rule for {}: {}",
2304                                device.path, e
2305                            ))
2306                        })?;
2307                    rules.push(rule);
2308                } else {
2309                    tracing::warn!("Failed to get device info for {}, skipping", device.path);
2310                }
2311            }
2312
2313            // Auto-allow GPU devices in cgroup when gpu spec is set
2314            if let Some(ref gpu) = spec.resources.gpu {
2315                match gpu.vendor.as_str() {
2316                    "nvidia" => {
2317                        // Allow all nvidia devices (major 195 for nvidia GPUs)
2318                        let rule = LinuxDeviceCgroupBuilder::default()
2319                            .allow(true)
2320                            .typ(LinuxDeviceType::C)
2321                            .major(195i64)
2322                            .access("rwm".to_string())
2323                            .build()
2324                            .map_err(|e| {
2325                                AgentError::InvalidSpec(format!(
2326                                    "failed to build GPU cgroup rule: {e}"
2327                                ))
2328                            })?;
2329                        rules.push(rule);
2330
2331                        // nvidia-uvm (major 510 or check dynamically)
2332                        let uvm_rule = LinuxDeviceCgroupBuilder::default()
2333                            .allow(true)
2334                            .typ(LinuxDeviceType::C)
2335                            .major(510i64)
2336                            .access("rwm".to_string())
2337                            .build()
2338                            .map_err(|e| {
2339                                AgentError::InvalidSpec(format!(
2340                                    "failed to build GPU UVM cgroup rule: {e}"
2341                                ))
2342                            })?;
2343                        rules.push(uvm_rule);
2344                    }
2345                    "amd" => {
2346                        // AMD ROCm: /dev/dri/renderD* and /dev/dri/card* (major 226)
2347                        let dri_rule = LinuxDeviceCgroupBuilder::default()
2348                            .allow(true)
2349                            .typ(LinuxDeviceType::C)
2350                            .major(226i64)
2351                            .access("rwm".to_string())
2352                            .build()
2353                            .map_err(|e| {
2354                                AgentError::InvalidSpec(format!(
2355                                    "failed to build AMD DRI cgroup rule: {e}"
2356                                ))
2357                            })?;
2358                        rules.push(dri_rule);
2359
2360                        // /dev/kfd - AMD Kernel Fusion Driver for compute (major 234)
2361                        let kfd_rule = LinuxDeviceCgroupBuilder::default()
2362                            .allow(true)
2363                            .typ(LinuxDeviceType::C)
2364                            .major(234i64)
2365                            .access("rwm".to_string())
2366                            .build()
2367                            .map_err(|e| {
2368                                AgentError::InvalidSpec(format!(
2369                                    "failed to build AMD KFD cgroup rule: {e}"
2370                                ))
2371                            })?;
2372                        rules.push(kfd_rule);
2373                    }
2374                    "intel" => {
2375                        // Intel GPU: /dev/dri/renderD* and /dev/dri/card* (major 226)
2376                        let dri_rule = LinuxDeviceCgroupBuilder::default()
2377                            .allow(true)
2378                            .typ(LinuxDeviceType::C)
2379                            .major(226i64)
2380                            .access("rwm".to_string())
2381                            .build()
2382                            .map_err(|e| {
2383                                AgentError::InvalidSpec(format!(
2384                                    "failed to build Intel DRI cgroup rule: {e}"
2385                                ))
2386                            })?;
2387                        rules.push(dri_rule);
2388                    }
2389                    other => {
2390                        // Unknown vendor - allow DRI devices as a reasonable default
2391                        tracing::warn!(
2392                            vendor = %other,
2393                            "Unknown GPU vendor, allowing DRI devices (major 226)"
2394                        );
2395                        let dri_rule = LinuxDeviceCgroupBuilder::default()
2396                            .allow(true)
2397                            .typ(LinuxDeviceType::C)
2398                            .major(226i64)
2399                            .access("rwm".to_string())
2400                            .build()
2401                            .map_err(|e| {
2402                                AgentError::InvalidSpec(format!(
2403                                    "failed to build GPU DRI cgroup rule: {e}"
2404                                ))
2405                            })?;
2406                        rules.push(dri_rule);
2407                    }
2408                }
2409            }
2410        }
2411
2412        Ok(rules)
2413    }
2414
2415    /// Build Linux device entries for passthrough
2416    ///
2417    /// # Platform
2418    /// Every branch below walks `/dev/*` on the host to resolve major/minor
2419    /// numbers via `MetadataExt::rdev`. On Windows (where this module is
2420    /// compiled only to feed the WSL2 delegate's cross-platform spec path) we
2421    /// skip device discovery and return an empty list — the Linux side of the
2422    /// delegate re-runs this step inside the WSL2 distro.
2423    #[allow(clippy::unused_self, clippy::too_many_lines)]
2424    #[cfg_attr(not(unix), allow(clippy::unnecessary_wraps, clippy::needless_return))]
2425    fn build_devices(
2426        &self,
2427        spec: &ServiceSpec,
2428        gpu_indices: Option<&[u32]>,
2429        skip_gpu_defaults: bool,
2430    ) -> Result<Vec<oci_spec::runtime::LinuxDevice>> {
2431        #[cfg(not(unix))]
2432        {
2433            let _ = (spec, gpu_indices, skip_gpu_defaults);
2434            return Ok(Vec::new());
2435        }
2436
2437        #[cfg(unix)]
2438        {
2439            let mut devices = Vec::new();
2440
2441            for device in &spec.devices {
2442                if let Ok((major, minor)) = get_device_major_minor(&device.path) {
2443                    let dev_type = get_device_type(&device.path).unwrap_or(LinuxDeviceType::C);
2444
2445                    let linux_device = LinuxDeviceBuilder::default()
2446                        .path(device.path.clone())
2447                        .typ(dev_type)
2448                        .major(major)
2449                        .minor(minor)
2450                        .file_mode(0o666u32)
2451                        .uid(0u32)
2452                        .gid(0u32)
2453                        .build()
2454                        .map_err(|e| {
2455                            AgentError::InvalidSpec(format!(
2456                                "failed to build device {}: {}",
2457                                device.path, e
2458                            ))
2459                        })?;
2460
2461                    devices.push(linux_device);
2462                }
2463            }
2464
2465            // When CDI is providing GPU device descriptors the caller will
2466            // append the vendor-supplied entries; skip our hard-coded
2467            // `/dev/nvidiaN` enumeration so we don't end up with both sources
2468            // of truth.
2469            if skip_gpu_defaults {
2470                return Ok(devices);
2471            }
2472
2473            // Auto-inject GPU devices when gpu spec is set
2474            if let Some(ref gpu) = spec.resources.gpu {
2475                let indices: Vec<u32> =
2476                    gpu_indices.map_or_else(|| (0..gpu.count).collect(), <[u32]>::to_vec);
2477
2478                match gpu.vendor.as_str() {
2479                    "nvidia" => {
2480                        // Always needed: nvidiactl, nvidia-uvm, nvidia-uvm-tools
2481                        let always_devices =
2482                            ["/dev/nvidiactl", "/dev/nvidia-uvm", "/dev/nvidia-uvm-tools"];
2483                        for dev_path in &always_devices {
2484                            if let Ok((major, minor)) = get_device_major_minor(dev_path) {
2485                                let dev_type =
2486                                    get_device_type(dev_path).unwrap_or(LinuxDeviceType::C);
2487                                let linux_device = LinuxDeviceBuilder::default()
2488                                    .path((*dev_path).to_string())
2489                                    .typ(dev_type)
2490                                    .major(major)
2491                                    .minor(minor)
2492                                    .file_mode(0o666u32)
2493                                    .uid(0u32)
2494                                    .gid(0u32)
2495                                    .build()
2496                                    .map_err(|e| {
2497                                        AgentError::InvalidSpec(format!(
2498                                            "failed to build GPU device {dev_path}: {e}"
2499                                        ))
2500                                    })?;
2501                                devices.push(linux_device);
2502                            } else {
2503                                tracing::warn!(
2504                                    "GPU device {} not found on host, skipping",
2505                                    dev_path
2506                                );
2507                            }
2508                        }
2509
2510                        // Per-GPU devices: /dev/nvidia0, /dev/nvidia1, etc.
2511                        for i in &indices {
2512                            let dev_path = format!("/dev/nvidia{i}");
2513                            if let Ok((major, minor)) = get_device_major_minor(&dev_path) {
2514                                let dev_type =
2515                                    get_device_type(&dev_path).unwrap_or(LinuxDeviceType::C);
2516                                let linux_device = LinuxDeviceBuilder::default()
2517                                    .path(dev_path.clone())
2518                                    .typ(dev_type)
2519                                    .major(major)
2520                                    .minor(minor)
2521                                    .file_mode(0o666u32)
2522                                    .uid(0u32)
2523                                    .gid(0u32)
2524                                    .build()
2525                                    .map_err(|e| {
2526                                        AgentError::InvalidSpec(format!(
2527                                            "failed to build GPU device {dev_path}: {e}"
2528                                        ))
2529                                    })?;
2530                                devices.push(linux_device);
2531                            } else {
2532                                tracing::warn!(
2533                                    "GPU device {} not found on host, skipping",
2534                                    dev_path
2535                                );
2536                            }
2537                        }
2538                    }
2539                    "amd" => {
2540                        // AMD ROCm: /dev/kfd is always required for compute
2541                        let amd_always_devices = ["/dev/kfd"];
2542                        for dev_path in &amd_always_devices {
2543                            if let Ok((major, minor)) = get_device_major_minor(dev_path) {
2544                                let dev_type =
2545                                    get_device_type(dev_path).unwrap_or(LinuxDeviceType::C);
2546                                let linux_device = LinuxDeviceBuilder::default()
2547                                    .path((*dev_path).to_string())
2548                                    .typ(dev_type)
2549                                    .major(major)
2550                                    .minor(minor)
2551                                    .file_mode(0o666u32)
2552                                    .uid(0u32)
2553                                    .gid(0u32)
2554                                    .build()
2555                                    .map_err(|e| {
2556                                        AgentError::InvalidSpec(format!(
2557                                            "failed to build GPU device {dev_path}: {e}"
2558                                        ))
2559                                    })?;
2560                                devices.push(linux_device);
2561                            } else {
2562                                tracing::warn!(
2563                                    "GPU device {} not found on host, skipping",
2564                                    dev_path
2565                                );
2566                            }
2567                        }
2568
2569                        // DRI render nodes: /dev/dri/renderD128, renderD129, etc.
2570                        for i in &indices {
2571                            let dev_path = format!("/dev/dri/renderD{}", 128 + i);
2572                            if let Ok((major, minor)) = get_device_major_minor(&dev_path) {
2573                                let dev_type =
2574                                    get_device_type(&dev_path).unwrap_or(LinuxDeviceType::C);
2575                                let linux_device = LinuxDeviceBuilder::default()
2576                                    .path(dev_path.clone())
2577                                    .typ(dev_type)
2578                                    .major(major)
2579                                    .minor(minor)
2580                                    .file_mode(0o666u32)
2581                                    .uid(0u32)
2582                                    .gid(0u32)
2583                                    .build()
2584                                    .map_err(|e| {
2585                                        AgentError::InvalidSpec(format!(
2586                                            "failed to build GPU device {dev_path}: {e}"
2587                                        ))
2588                                    })?;
2589                                devices.push(linux_device);
2590                            } else {
2591                                tracing::warn!(
2592                                    "GPU device {} not found on host, skipping",
2593                                    dev_path
2594                                );
2595                            }
2596                        }
2597
2598                        // DRI card nodes: /dev/dri/card0, card1, etc.
2599                        for i in &indices {
2600                            let dev_path = format!("/dev/dri/card{i}");
2601                            if let Ok((major, minor)) = get_device_major_minor(&dev_path) {
2602                                let dev_type =
2603                                    get_device_type(&dev_path).unwrap_or(LinuxDeviceType::C);
2604                                let linux_device = LinuxDeviceBuilder::default()
2605                                    .path(dev_path.clone())
2606                                    .typ(dev_type)
2607                                    .major(major)
2608                                    .minor(minor)
2609                                    .file_mode(0o666u32)
2610                                    .uid(0u32)
2611                                    .gid(0u32)
2612                                    .build()
2613                                    .map_err(|e| {
2614                                        AgentError::InvalidSpec(format!(
2615                                            "failed to build GPU device {dev_path}: {e}"
2616                                        ))
2617                                    })?;
2618                                devices.push(linux_device);
2619                            } else {
2620                                tracing::warn!(
2621                                    "GPU device {} not found on host, skipping",
2622                                    dev_path
2623                                );
2624                            }
2625                        }
2626                    }
2627                    "intel" => {
2628                        // Intel GPU: DRI render nodes /dev/dri/renderD128, etc.
2629                        for i in &indices {
2630                            let dev_path = format!("/dev/dri/renderD{}", 128 + i);
2631                            if let Ok((major, minor)) = get_device_major_minor(&dev_path) {
2632                                let dev_type =
2633                                    get_device_type(&dev_path).unwrap_or(LinuxDeviceType::C);
2634                                let linux_device = LinuxDeviceBuilder::default()
2635                                    .path(dev_path.clone())
2636                                    .typ(dev_type)
2637                                    .major(major)
2638                                    .minor(minor)
2639                                    .file_mode(0o666u32)
2640                                    .uid(0u32)
2641                                    .gid(0u32)
2642                                    .build()
2643                                    .map_err(|e| {
2644                                        AgentError::InvalidSpec(format!(
2645                                            "failed to build GPU device {dev_path}: {e}"
2646                                        ))
2647                                    })?;
2648                                devices.push(linux_device);
2649                            } else {
2650                                tracing::warn!(
2651                                    "GPU device {} not found on host, skipping",
2652                                    dev_path
2653                                );
2654                            }
2655                        }
2656
2657                        // Intel DRI card nodes: /dev/dri/card0, card1, etc.
2658                        for i in &indices {
2659                            let dev_path = format!("/dev/dri/card{i}");
2660                            if let Ok((major, minor)) = get_device_major_minor(&dev_path) {
2661                                let dev_type =
2662                                    get_device_type(&dev_path).unwrap_or(LinuxDeviceType::C);
2663                                let linux_device = LinuxDeviceBuilder::default()
2664                                    .path(dev_path.clone())
2665                                    .typ(dev_type)
2666                                    .major(major)
2667                                    .minor(minor)
2668                                    .file_mode(0o666u32)
2669                                    .uid(0u32)
2670                                    .gid(0u32)
2671                                    .build()
2672                                    .map_err(|e| {
2673                                        AgentError::InvalidSpec(format!(
2674                                            "failed to build GPU device {dev_path}: {e}"
2675                                        ))
2676                                    })?;
2677                                devices.push(linux_device);
2678                            } else {
2679                                tracing::warn!(
2680                                    "GPU device {} not found on host, skipping",
2681                                    dev_path
2682                                );
2683                            }
2684                        }
2685                    }
2686                    other => {
2687                        // Unknown vendor - try DRI render nodes as default
2688                        tracing::warn!(
2689                            vendor = %other,
2690                            "Unknown GPU vendor, attempting DRI device passthrough"
2691                        );
2692                        for i in &indices {
2693                            let dev_path = format!("/dev/dri/renderD{}", 128 + i);
2694                            if let Ok((major, minor)) = get_device_major_minor(&dev_path) {
2695                                let dev_type =
2696                                    get_device_type(&dev_path).unwrap_or(LinuxDeviceType::C);
2697                                let linux_device = LinuxDeviceBuilder::default()
2698                                    .path(dev_path.clone())
2699                                    .typ(dev_type)
2700                                    .major(major)
2701                                    .minor(minor)
2702                                    .file_mode(0o666u32)
2703                                    .uid(0u32)
2704                                    .gid(0u32)
2705                                    .build()
2706                                    .map_err(|e| {
2707                                        AgentError::InvalidSpec(format!(
2708                                            "failed to build GPU device {dev_path}: {e}"
2709                                        ))
2710                                    })?;
2711                                devices.push(linux_device);
2712                            } else {
2713                                tracing::warn!(
2714                                    "GPU device {} not found on host, skipping",
2715                                    dev_path
2716                                );
2717                            }
2718                        }
2719                    }
2720                }
2721            }
2722
2723            Ok(devices)
2724        } // end #[cfg(unix)]
2725    }
2726
2727    /// Generate the OCI spec and write config.json to the bundle directory
2728    ///
2729    /// Unlike `build()`, this does NOT create the bundle directory or set up rootfs.
2730    /// Use this when the bundle directory and rootfs already exist (e.g., rootfs was
2731    /// extracted directly by `LayerUnpacker`).
2732    ///
2733    /// # Errors
2734    /// Returns an error if the OCI spec cannot be built or config.json cannot be written.
2735    ///
2736    /// # Returns
2737    /// The path to the bundle directory on success
2738    pub async fn write_config(
2739        &self,
2740        container_id: &ContainerId,
2741        spec: &ServiceSpec,
2742    ) -> Result<PathBuf> {
2743        // Generate OCI runtime spec
2744        let oci_spec = self
2745            .build_spec_only(container_id, spec, &self.volume_paths)
2746            .await?;
2747
2748        // Write config.json
2749        let config_path = self.bundle_dir.join("config.json");
2750        let config_json =
2751            serde_json::to_string_pretty(&oci_spec).map_err(|e| AgentError::CreateFailed {
2752                id: container_id.to_string(),
2753                reason: format!("failed to serialize OCI spec: {e}"),
2754            })?;
2755
2756        fs::write(&config_path, config_json)
2757            .await
2758            .map_err(|e| AgentError::CreateFailed {
2759                id: container_id.to_string(),
2760                reason: format!("failed to write config.json: {e}"),
2761            })?;
2762
2763        tracing::debug!(
2764            "Wrote OCI config.json at {} for container {}",
2765            config_path.display(),
2766            container_id
2767        );
2768
2769        Ok(self.bundle_dir.clone())
2770    }
2771
2772    /// Resolve command from `ServiceSpec` and optional image config following Docker/OCI semantics
2773    ///
2774    /// Resolution order:
2775    /// 1. spec entrypoint + args -> use those
2776    /// 2. spec entrypoint only -> use entrypoint
2777    /// 3. spec args only -> use args
2778    /// 4. `image_config` entrypoint/cmd -> use `image_config.full_command()`
2779    /// 5. fallback to /bin/sh
2780    fn resolve_command_from_spec(
2781        spec: &ServiceSpec,
2782        image_config: Option<&zlayer_registry::ImageConfig>,
2783    ) -> Vec<String> {
2784        let mut args = Vec::new();
2785
2786        match (&spec.command.entrypoint, &spec.command.args) {
2787            (Some(entrypoint), Some(cmd_args)) => {
2788                args.extend_from_slice(entrypoint);
2789                args.extend_from_slice(cmd_args);
2790            }
2791            (Some(entrypoint), None) => {
2792                args.extend_from_slice(entrypoint);
2793            }
2794            (None, Some(cmd_args)) if !cmd_args.is_empty() => {
2795                args.extend_from_slice(cmd_args);
2796            }
2797            _ => {
2798                // No spec command - try image config
2799                if let Some(img_cmd) =
2800                    image_config.and_then(zlayer_registry::ImageConfig::full_command)
2801                {
2802                    if img_cmd.is_empty() {
2803                        args.push("/bin/sh".to_string());
2804                    } else {
2805                        args.extend(img_cmd);
2806                    }
2807                } else {
2808                    args.push("/bin/sh".to_string());
2809                }
2810            }
2811        }
2812
2813        args
2814    }
2815
2816    /// Clean up a bundle directory
2817    ///
2818    /// Removes the bundle directory and all its contents.
2819    ///
2820    /// # Errors
2821    /// Returns an error if the bundle directory cannot be removed.
2822    pub async fn cleanup(&self) -> Result<()> {
2823        if self.bundle_dir.exists() {
2824            fs::remove_dir_all(&self.bundle_dir)
2825                .await
2826                .map_err(|e| AgentError::CreateFailed {
2827                    id: "cleanup".to_string(),
2828                    reason: format!(
2829                        "failed to remove bundle directory {}: {}",
2830                        self.bundle_dir.display(),
2831                        e
2832                    ),
2833                })?;
2834        }
2835        Ok(())
2836    }
2837}
2838
2839/// Create a bundle for a container
2840///
2841/// Convenience function that creates a bundle in the default location.
2842///
2843/// # Errors
2844/// Returns an error if bundle creation fails.
2845///
2846/// # Platform
2847/// Unix-only — wraps [`BundleBuilder::build`], which uses
2848/// `tokio::fs::symlink` (not available on Windows). Windows callers should
2849/// use [`BundleBuilder::build_spec_only`] directly and pipe the result into
2850/// a WSL2 delegate.
2851#[cfg(unix)]
2852pub async fn create_bundle(
2853    container_id: &ContainerId,
2854    spec: &ServiceSpec,
2855    rootfs_path: Option<PathBuf>,
2856) -> Result<PathBuf> {
2857    let mut builder =
2858        BundleBuilder::for_container(container_id).with_host_network(spec.host_network);
2859
2860    if let Some(rootfs) = rootfs_path {
2861        builder = builder.with_rootfs(rootfs);
2862    }
2863
2864    builder.build(container_id, spec).await
2865}
2866
2867/// Clean up a container's bundle
2868///
2869/// Convenience function to remove a bundle from the default location.
2870///
2871/// # Errors
2872/// Returns an error if cleanup fails.
2873pub async fn cleanup_bundle(container_id: &ContainerId) -> Result<()> {
2874    let builder = BundleBuilder::for_container(container_id);
2875    builder.cleanup().await
2876}
2877
2878#[cfg(test)]
2879mod tests {
2880    use super::*;
2881    use zlayer_spec::*;
2882
2883    fn mock_spec() -> ServiceSpec {
2884        serde_yaml::from_str::<DeploymentSpec>(
2885            r"
2886version: v1
2887deployment: test
2888services:
2889  test:
2890    rtype: service
2891    image:
2892      name: test:latest
2893    endpoints:
2894      - name: http
2895        protocol: http
2896        port: 8080
2897",
2898        )
2899        .unwrap()
2900        .services
2901        .remove("test")
2902        .unwrap()
2903    }
2904
2905    #[cfg(target_os = "linux")]
2906    fn mock_spec_with_resources() -> ServiceSpec {
2907        serde_yaml::from_str::<DeploymentSpec>(
2908            r"
2909version: v1
2910deployment: test
2911services:
2912  test:
2913    rtype: service
2914    image:
2915      name: test:latest
2916    resources:
2917      cpu: 0.5
2918      memory: 512Mi
2919    env:
2920      MY_VAR: my_value
2921      ANOTHER: value2
2922    endpoints:
2923      - name: http
2924        protocol: http
2925        port: 8080
2926",
2927        )
2928        .unwrap()
2929        .services
2930        .remove("test")
2931        .unwrap()
2932    }
2933
2934    #[cfg(target_os = "linux")]
2935    fn mock_privileged_spec() -> ServiceSpec {
2936        serde_yaml::from_str::<DeploymentSpec>(
2937            r"
2938version: v1
2939deployment: test
2940services:
2941  test:
2942    rtype: service
2943    image:
2944      name: test:latest
2945    privileged: true
2946    endpoints:
2947      - name: http
2948        protocol: http
2949        port: 8080
2950",
2951        )
2952        .unwrap()
2953        .services
2954        .remove("test")
2955        .unwrap()
2956    }
2957
2958    #[test]
2959    fn test_parse_memory_string() {
2960        assert_eq!(parse_memory_string("512Mi").unwrap(), 512 * 1024 * 1024);
2961        assert_eq!(parse_memory_string("1Gi").unwrap(), 1024 * 1024 * 1024);
2962        assert_eq!(parse_memory_string("2G").unwrap(), 2 * 1000 * 1000 * 1000);
2963        assert_eq!(parse_memory_string("1024").unwrap(), 1024);
2964        assert_eq!(parse_memory_string("512Ki").unwrap(), 512 * 1024);
2965    }
2966
2967    #[test]
2968    fn test_parse_memory_string_errors() {
2969        assert!(parse_memory_string("").is_err());
2970        assert!(parse_memory_string("abc").is_err());
2971        assert!(parse_memory_string("12.5Mi").is_err());
2972    }
2973
2974    #[test]
2975    fn test_generate_resolv_conf_single_nameserver() {
2976        let out = generate_resolv_conf(&["10.42.0.1".to_string()]);
2977        assert_eq!(out, "nameserver 10.42.0.1\noptions edns0\n");
2978    }
2979
2980    #[test]
2981    fn test_generate_resolv_conf_two_nameservers() {
2982        let out = generate_resolv_conf(&["10.42.0.1".to_string(), "fd00::1".to_string()]);
2983        assert_eq!(
2984            out,
2985            "nameserver 10.42.0.1\nnameserver fd00::1\noptions edns0\n"
2986        );
2987    }
2988
2989    #[cfg(target_os = "linux")]
2990    #[tokio::test]
2991    async fn test_build_oci_spec_injects_resolv_conf_mount() {
2992        let dir = tempfile::tempdir().unwrap();
2993        let id = ContainerId::new("test".to_string(), 1);
2994        let mut spec = mock_spec();
2995        spec.dns = vec!["10.42.0.1".to_string()];
2996        let builder = BundleBuilder::new(dir.path().to_path_buf());
2997
2998        let oci_spec = builder
2999            .build_spec_only(&id, &spec, &std::collections::HashMap::new())
3000            .await
3001            .unwrap();
3002
3003        let mounts = oci_spec.mounts().as_ref().expect("mounts present");
3004        let resolv_mount = mounts
3005            .iter()
3006            .find(|m| m.destination() == Path::new("/etc/resolv.conf"))
3007            .expect("resolv.conf mount injected");
3008        let source = resolv_mount.source().as_ref().unwrap();
3009        let written = std::fs::read_to_string(source).unwrap();
3010        assert_eq!(written, "nameserver 10.42.0.1\noptions edns0\n");
3011    }
3012
3013    #[cfg(target_os = "linux")]
3014    #[tokio::test]
3015    async fn test_build_oci_spec_no_resolv_conf_when_dns_empty() {
3016        let dir = tempfile::tempdir().unwrap();
3017        let id = ContainerId::new("test".to_string(), 1);
3018        let spec = mock_spec(); // spec.dns defaults to empty
3019        let builder = BundleBuilder::new(dir.path().to_path_buf());
3020
3021        let oci_spec = builder
3022            .build_spec_only(&id, &spec, &std::collections::HashMap::new())
3023            .await
3024            .unwrap();
3025
3026        let mounts = oci_spec.mounts().as_ref().expect("mounts present");
3027        assert!(
3028            !mounts
3029                .iter()
3030                .any(|m| m.destination() == Path::new("/etc/resolv.conf")),
3031            "no resolv.conf mount should be injected for empty spec.dns"
3032        );
3033    }
3034
3035    #[cfg(target_os = "linux")]
3036    #[tokio::test]
3037    async fn test_build_oci_spec_no_resolv_conf_when_host_network() {
3038        let dir = tempfile::tempdir().unwrap();
3039        let id = ContainerId::new("test".to_string(), 1);
3040        let mut spec = mock_spec();
3041        spec.dns = vec!["10.42.0.1".to_string()];
3042        spec.host_network = true;
3043        let builder = BundleBuilder::new(dir.path().to_path_buf());
3044
3045        let oci_spec = builder
3046            .build_spec_only(&id, &spec, &std::collections::HashMap::new())
3047            .await
3048            .unwrap();
3049
3050        let mounts = oci_spec.mounts().as_ref().expect("mounts present");
3051        assert!(
3052            !mounts
3053                .iter()
3054                .any(|m| m.destination() == Path::new("/etc/resolv.conf")),
3055            "host_network containers must inherit the host resolv.conf"
3056        );
3057    }
3058
3059    #[test]
3060    fn test_bundle_builder_new() {
3061        let builder = BundleBuilder::new("/tmp/test-bundle".into());
3062        assert_eq!(builder.bundle_dir(), Path::new("/tmp/test-bundle"));
3063        assert!(builder.rootfs_path.is_none());
3064    }
3065
3066    #[test]
3067    fn test_bundle_builder_for_container() {
3068        let dirs = zlayer_paths::ZLayerDirs::system_default();
3069        let id = ContainerId::new("myservice".to_string(), 1);
3070        let builder = BundleBuilder::for_container(&id);
3071        assert_eq!(builder.bundle_dir(), dirs.bundles().join("myservice-rep-1"));
3072    }
3073
3074    #[test]
3075    fn test_bundle_builder_with_rootfs() {
3076        let dirs = zlayer_paths::ZLayerDirs::system_default();
3077        let builder = BundleBuilder::new("/tmp/test-bundle".into())
3078            .with_rootfs(dirs.rootfs().join("myimage"));
3079        assert_eq!(builder.rootfs_path, Some(dirs.rootfs().join("myimage")));
3080    }
3081
3082    #[cfg(target_os = "linux")]
3083    #[tokio::test]
3084    async fn test_build_oci_spec_basic() {
3085        let id = ContainerId::new("test".to_string(), 1);
3086        let spec = mock_spec();
3087        let builder = BundleBuilder::new("/tmp/test-bundle".into());
3088
3089        let oci_spec = builder
3090            .build_spec_only(&id, &spec, &std::collections::HashMap::new())
3091            .await
3092            .unwrap();
3093
3094        assert_eq!(oci_spec.version(), "1.0.2");
3095        assert!(oci_spec.root().is_some());
3096        assert_eq!(
3097            oci_spec.root().as_ref().unwrap().path(),
3098            std::path::Path::new("rootfs")
3099        );
3100        assert!(oci_spec.process().is_some());
3101        assert!(oci_spec.linux().is_some());
3102    }
3103
3104    #[cfg(target_os = "linux")]
3105    #[tokio::test]
3106    async fn test_build_oci_spec_with_resources() {
3107        let id = ContainerId::new("test".to_string(), 1);
3108        let spec = mock_spec_with_resources();
3109        let builder = BundleBuilder::new("/tmp/test-bundle".into());
3110
3111        let oci_spec = builder
3112            .build_spec_only(&id, &spec, &std::collections::HashMap::new())
3113            .await
3114            .unwrap();
3115
3116        // Check that resources are set
3117        let linux = oci_spec.linux().as_ref().unwrap();
3118        let resources = linux.resources().as_ref().unwrap();
3119
3120        // Check CPU
3121        let cpu = resources.cpu().as_ref().unwrap();
3122        assert_eq!(cpu.quota(), Some(50_000)); // 0.5 cores * 100000
3123        assert_eq!(cpu.period(), Some(100_000));
3124
3125        // Check memory
3126        let memory = resources.memory().as_ref().unwrap();
3127        assert_eq!(memory.limit(), Some(512 * 1024 * 1024)); // 512Mi
3128    }
3129
3130    #[cfg(target_os = "linux")]
3131    #[tokio::test]
3132    async fn test_build_oci_spec_translates_ulimits() {
3133        let id = ContainerId::new("test".to_string(), 1);
3134        let mut spec = mock_spec();
3135        spec.ulimits.insert(
3136            "nofile".to_string(),
3137            UlimitSpec {
3138                soft: 100_000,
3139                hard: 200_000,
3140            },
3141        );
3142        // Negative limits must clamp to 0 (matches the `.max(0)` conversion).
3143        spec.ulimits
3144            .insert("nproc".to_string(), UlimitSpec { soft: -1, hard: -5 });
3145        let builder = BundleBuilder::new("/tmp/test-bundle".into());
3146
3147        let oci_spec = builder
3148            .build_spec_only(&id, &spec, &std::collections::HashMap::new())
3149            .await
3150            .unwrap();
3151
3152        let process = oci_spec.process().as_ref().expect("process present");
3153        let rlimits = process.rlimits().as_ref().expect("rlimits present");
3154
3155        // Exactly one nofile entry: our override fully replaces the oci
3156        // default (1024), it does not append a duplicate the kernel would
3157        // resolve ambiguously.
3158        let nofile: Vec<_> = rlimits
3159            .iter()
3160            .filter(|r| r.typ() == PosixRlimitType::RlimitNofile)
3161            .collect();
3162        assert_eq!(nofile.len(), 1, "nofile must not be duplicated");
3163        assert_eq!(nofile[0].soft(), 100_000);
3164        assert_eq!(nofile[0].hard(), 200_000);
3165
3166        let nproc = rlimits
3167            .iter()
3168            .find(|r| r.typ() == PosixRlimitType::RlimitNproc)
3169            .expect("nproc rlimit present");
3170        assert_eq!(nproc.soft(), 0, "negative soft clamps to 0");
3171        assert_eq!(nproc.hard(), 0, "negative hard clamps to 0");
3172    }
3173
3174    #[cfg(target_os = "linux")]
3175    #[tokio::test]
3176    async fn test_build_oci_spec_rejects_unknown_ulimit() {
3177        let id = ContainerId::new("test".to_string(), 1);
3178        let mut spec = mock_spec();
3179        spec.ulimits.insert(
3180            "not_a_real_ulimit".to_string(),
3181            UlimitSpec { soft: 1, hard: 1 },
3182        );
3183        let builder = BundleBuilder::new("/tmp/test-bundle".into());
3184
3185        let err = builder
3186            .build_spec_only(&id, &spec, &std::collections::HashMap::new())
3187            .await
3188            .expect_err("unknown ulimit name must be rejected");
3189        assert!(
3190            err.to_string().contains("not_a_real_ulimit"),
3191            "error should name the unknown ulimit: {err}"
3192        );
3193    }
3194
3195    #[cfg(target_os = "linux")]
3196    #[tokio::test]
3197    async fn test_build_oci_spec_keeps_oci_default_rlimits_when_ulimits_empty() {
3198        // When `spec.ulimits` is empty we must NOT touch the process builder's
3199        // rlimits — the OCI default (`ProcessBuilder::default()` ships a single
3200        // `RLIMIT_NOFILE` of 1024, the kernel default). This documents the
3201        // exact baseline the ulimits override replaces, so a regression that
3202        // wipes the default (or, worse, our override) is caught here.
3203        let id = ContainerId::new("test".to_string(), 1);
3204        let spec = mock_spec();
3205        let builder = BundleBuilder::new("/tmp/test-bundle".into());
3206
3207        let oci_spec = builder
3208            .build_spec_only(&id, &spec, &std::collections::HashMap::new())
3209            .await
3210            .unwrap();
3211
3212        let process = oci_spec.process().as_ref().expect("process present");
3213        let rlimits = process
3214            .rlimits()
3215            .as_ref()
3216            .expect("oci default rlimits present");
3217        let nofile = rlimits
3218            .iter()
3219            .find(|r| r.typ() == PosixRlimitType::RlimitNofile)
3220            .expect("default nofile rlimit present");
3221        // The oci-spec default the daemon would otherwise leak into the
3222        // container: 1024 — the exact value that EMFILE'd PlatformStore.
3223        assert_eq!(nofile.soft(), 1024);
3224        assert_eq!(nofile.hard(), 1024);
3225    }
3226
3227    #[cfg(target_os = "linux")]
3228    #[tokio::test]
3229    async fn test_build_oci_spec_privileged() {
3230        let id = ContainerId::new("test".to_string(), 1);
3231        let spec = mock_privileged_spec();
3232        let builder = BundleBuilder::new("/tmp/test-bundle".into());
3233
3234        let oci_spec = builder
3235            .build_spec_only(&id, &spec, &std::collections::HashMap::new())
3236            .await
3237            .unwrap();
3238
3239        // Check that all capabilities are set
3240        let process = oci_spec.process().as_ref().unwrap();
3241        let caps = process.capabilities().as_ref().unwrap();
3242        let bounding = caps.bounding().as_ref().unwrap();
3243
3244        // Should have all capabilities
3245        assert!(bounding.contains(&Capability::SysAdmin));
3246        assert!(bounding.contains(&Capability::NetAdmin));
3247
3248        // Check that masked paths are NOT set for privileged
3249        let linux = oci_spec.linux().as_ref().unwrap();
3250        assert!(
3251            linux.masked_paths().is_none() || linux.masked_paths().as_ref().unwrap().is_empty()
3252        );
3253    }
3254
3255    #[cfg(target_os = "linux")]
3256    #[tokio::test]
3257    async fn test_build_oci_spec_environment() {
3258        let id = ContainerId::new("test".to_string(), 1);
3259        let spec = mock_spec_with_resources();
3260        let builder = BundleBuilder::new("/tmp/test-bundle".into())
3261            .with_env("EXTRA_VAR".to_string(), "extra_value".to_string());
3262
3263        let oci_spec = builder
3264            .build_spec_only(&id, &spec, &std::collections::HashMap::new())
3265            .await
3266            .unwrap();
3267
3268        let process = oci_spec.process().as_ref().unwrap();
3269        let env = process.env().as_ref().unwrap();
3270
3271        // Check service env vars are present
3272        assert!(env.iter().any(|e| e == "MY_VAR=my_value"));
3273        assert!(env.iter().any(|e| e == "ANOTHER=value2"));
3274        // Check extra env var is present
3275        assert!(env.iter().any(|e| e == "EXTRA_VAR=extra_value"));
3276        // Check PATH is present
3277        assert!(env.iter().any(|e| e.starts_with("PATH=")));
3278    }
3279
3280    #[cfg(target_os = "linux")]
3281    #[tokio::test]
3282    async fn test_build_namespaces() {
3283        let id = ContainerId::new("test".to_string(), 1);
3284        let spec = mock_spec();
3285        let builder = BundleBuilder::new("/tmp/test-bundle".into());
3286
3287        let oci_spec = builder
3288            .build_spec_only(&id, &spec, &std::collections::HashMap::new())
3289            .await
3290            .unwrap();
3291        let linux = oci_spec.linux().as_ref().unwrap();
3292        let namespaces = linux.namespaces().as_ref().unwrap();
3293
3294        // Check we have the expected namespaces
3295        let namespace_types: Vec<_> = namespaces
3296            .iter()
3297            .map(oci_spec::runtime::LinuxNamespace::typ)
3298            .collect();
3299        assert!(namespace_types.contains(&LinuxNamespaceType::Pid));
3300        assert!(namespace_types.contains(&LinuxNamespaceType::Ipc));
3301        assert!(namespace_types.contains(&LinuxNamespaceType::Uts));
3302        assert!(namespace_types.contains(&LinuxNamespaceType::Mount));
3303        assert!(namespace_types.contains(&LinuxNamespaceType::Network));
3304    }
3305
3306    #[cfg(target_os = "linux")]
3307    #[tokio::test]
3308    async fn test_build_namespaces_host_network() {
3309        let id = ContainerId::new("test".to_string(), 1);
3310        let spec = mock_spec();
3311        let builder = BundleBuilder::new("/tmp/test-bundle".into()).with_host_network(true);
3312
3313        let oci_spec = builder
3314            .build_spec_only(&id, &spec, &std::collections::HashMap::new())
3315            .await
3316            .unwrap();
3317        let linux = oci_spec.linux().as_ref().unwrap();
3318        let namespaces = linux.namespaces().as_ref().unwrap();
3319
3320        // Check we have the expected namespaces (NO Network namespace)
3321        let namespace_types: Vec<_> = namespaces
3322            .iter()
3323            .map(oci_spec::runtime::LinuxNamespace::typ)
3324            .collect();
3325        assert!(namespace_types.contains(&LinuxNamespaceType::Pid));
3326        assert!(namespace_types.contains(&LinuxNamespaceType::Ipc));
3327        assert!(namespace_types.contains(&LinuxNamespaceType::Uts));
3328        assert!(namespace_types.contains(&LinuxNamespaceType::Mount));
3329        assert!(
3330            !namespace_types.contains(&LinuxNamespaceType::Network),
3331            "Network namespace should NOT be present in host_network mode"
3332        );
3333    }
3334
3335    #[test]
3336    fn test_build_default_mounts() {
3337        let spec = mock_spec();
3338        let builder = BundleBuilder::new("/tmp/test-bundle".into());
3339
3340        let mounts = builder.build_default_mounts(&spec).unwrap();
3341
3342        // Check we have the expected mounts
3343        let mount_destinations: Vec<_> = mounts
3344            .iter()
3345            .map(|m| m.destination().to_string_lossy().to_string())
3346            .collect();
3347        assert!(mount_destinations.contains(&"/proc".to_string()));
3348        assert!(mount_destinations.contains(&"/dev".to_string()));
3349        assert!(mount_destinations.contains(&"/dev/pts".to_string()));
3350        assert!(mount_destinations.contains(&"/dev/shm".to_string()));
3351        assert!(mount_destinations.contains(&"/sys".to_string()));
3352    }
3353
3354    #[test]
3355    fn test_build_storage_mounts_bind() {
3356        let spec = serde_yaml::from_str::<zlayer_spec::DeploymentSpec>(
3357            r"
3358version: v1
3359deployment: test
3360services:
3361  test:
3362    image:
3363      name: test:latest
3364    storage:
3365      - type: bind
3366        source: /host/data
3367        target: /app/data
3368        readonly: true
3369",
3370        )
3371        .unwrap()
3372        .services
3373        .remove("test")
3374        .unwrap();
3375
3376        let builder = BundleBuilder::new("/tmp/test-bundle".into());
3377        let volume_paths = std::collections::HashMap::new();
3378
3379        let mounts = builder.build_storage_mounts(&spec, &volume_paths).unwrap();
3380
3381        assert_eq!(mounts.len(), 1);
3382        assert_eq!(mounts[0].destination().to_string_lossy(), "/app/data");
3383        assert_eq!(
3384            mounts[0]
3385                .source()
3386                .as_ref()
3387                .map(|s| s.to_string_lossy().to_string()),
3388            Some("/host/data".to_string())
3389        );
3390        let options = mounts[0].options().as_ref().unwrap();
3391        assert!(options.contains(&"rbind".to_string()));
3392        assert!(options.contains(&"ro".to_string()));
3393    }
3394
3395    #[test]
3396    fn test_build_storage_mounts_named() {
3397        let spec = serde_yaml::from_str::<zlayer_spec::DeploymentSpec>(
3398            r"
3399version: v1
3400deployment: test
3401services:
3402  test:
3403    image:
3404      name: test:latest
3405    storage:
3406      - type: named
3407        name: my-volume
3408        target: /app/data
3409",
3410        )
3411        .unwrap()
3412        .services
3413        .remove("test")
3414        .unwrap();
3415
3416        let dirs = zlayer_paths::ZLayerDirs::system_default();
3417        let builder = BundleBuilder::new("/tmp/test-bundle".into());
3418        let mut volume_paths = std::collections::HashMap::new();
3419        volume_paths.insert("my-volume".to_string(), dirs.volumes().join("my-volume"));
3420
3421        let mounts = builder.build_storage_mounts(&spec, &volume_paths).unwrap();
3422
3423        assert_eq!(mounts.len(), 1);
3424        assert_eq!(mounts[0].destination().to_string_lossy(), "/app/data");
3425        assert_eq!(
3426            mounts[0]
3427                .source()
3428                .as_ref()
3429                .map(|s| s.to_string_lossy().to_string()),
3430            Some(
3431                dirs.volumes()
3432                    .join("my-volume")
3433                    .to_string_lossy()
3434                    .into_owned()
3435            )
3436        );
3437    }
3438
3439    #[test]
3440    fn test_build_storage_mounts_tmpfs() {
3441        let spec = serde_yaml::from_str::<zlayer_spec::DeploymentSpec>(
3442            r"
3443version: v1
3444deployment: test
3445services:
3446  test:
3447    image:
3448      name: test:latest
3449    storage:
3450      - type: tmpfs
3451        target: /app/tmp
3452        size: 256Mi
3453        mode: 1777
3454",
3455        )
3456        .unwrap()
3457        .services
3458        .remove("test")
3459        .unwrap();
3460
3461        let builder = BundleBuilder::new("/tmp/test-bundle".into());
3462        let volume_paths = std::collections::HashMap::new();
3463
3464        let mounts = builder.build_storage_mounts(&spec, &volume_paths).unwrap();
3465
3466        assert_eq!(mounts.len(), 1);
3467        assert_eq!(mounts[0].destination().to_string_lossy(), "/app/tmp");
3468        assert_eq!(mounts[0].typ().as_ref().map(String::as_str), Some("tmpfs"));
3469        let options = mounts[0].options().as_ref().unwrap();
3470        assert!(options.iter().any(|o| o.starts_with("size=")));
3471        assert!(options.iter().any(|o| o.starts_with("mode=")));
3472    }
3473
3474    #[test]
3475    fn test_build_storage_mounts_multiple() {
3476        let spec = serde_yaml::from_str::<zlayer_spec::DeploymentSpec>(
3477            r"
3478version: v1
3479deployment: test
3480services:
3481  test:
3482    image:
3483      name: test:latest
3484    storage:
3485      - type: bind
3486        source: /etc/config
3487        target: /app/config
3488        readonly: true
3489      - type: named
3490        name: app-data
3491        target: /app/data
3492      - type: tmpfs
3493        target: /app/tmp
3494",
3495        )
3496        .unwrap()
3497        .services
3498        .remove("test")
3499        .unwrap();
3500
3501        let dirs = zlayer_paths::ZLayerDirs::system_default();
3502        let builder = BundleBuilder::new("/tmp/test-bundle".into());
3503        let mut volume_paths = std::collections::HashMap::new();
3504        volume_paths.insert("app-data".to_string(), dirs.volumes().join("app-data"));
3505
3506        let mounts = builder.build_storage_mounts(&spec, &volume_paths).unwrap();
3507
3508        assert_eq!(mounts.len(), 3);
3509
3510        // Verify each mount is correct type
3511        let destinations: Vec<String> = mounts
3512            .iter()
3513            .map(|m| m.destination().to_string_lossy().to_string())
3514            .collect();
3515        assert!(destinations.contains(&"/app/config".to_string()));
3516        assert!(destinations.contains(&"/app/data".to_string()));
3517        assert!(destinations.contains(&"/app/tmp".to_string()));
3518    }
3519
3520    #[test]
3521    fn test_build_storage_mounts_anonymous_missing_path() {
3522        let spec = serde_yaml::from_str::<zlayer_spec::DeploymentSpec>(
3523            r"
3524version: v1
3525deployment: test
3526services:
3527  test:
3528    image:
3529      name: test:latest
3530    storage:
3531      - type: anonymous
3532        target: /app/cache
3533",
3534        )
3535        .unwrap()
3536        .services
3537        .remove("test")
3538        .unwrap();
3539
3540        let builder = BundleBuilder::new("/tmp/test-bundle".into());
3541        let volume_paths = std::collections::HashMap::new(); // No path provided
3542
3543        let result = builder.build_storage_mounts(&spec, &volume_paths);
3544
3545        // Should fail because anonymous volume path not prepared
3546        assert!(result.is_err());
3547    }
3548
3549    #[cfg(target_os = "linux")]
3550    #[tokio::test]
3551    async fn test_oci_spec_includes_storage_mounts() {
3552        let id = ContainerId::new("test".to_string(), 1);
3553        let spec = serde_yaml::from_str::<zlayer_spec::DeploymentSpec>(
3554            r"
3555version: v1
3556deployment: test
3557services:
3558  test:
3559    image:
3560      name: test:latest
3561    storage:
3562      - type: bind
3563        source: /host/data
3564        target: /app/data
3565      - type: tmpfs
3566        target: /app/tmp
3567",
3568        )
3569        .unwrap()
3570        .services
3571        .remove("test")
3572        .unwrap();
3573
3574        let builder = BundleBuilder::new("/tmp/test-bundle".into());
3575        let volume_paths = std::collections::HashMap::new();
3576
3577        let oci_spec = builder
3578            .build_spec_only(&id, &spec, &volume_paths)
3579            .await
3580            .unwrap();
3581
3582        // Verify the OCI spec includes storage mounts
3583        let mounts = oci_spec.mounts().as_ref().unwrap();
3584        let destinations: Vec<String> = mounts
3585            .iter()
3586            .map(|m| m.destination().to_string_lossy().to_string())
3587            .collect();
3588
3589        // Should include both default mounts and storage mounts
3590        assert!(destinations.contains(&"/proc".to_string())); // default
3591        assert!(destinations.contains(&"/dev".to_string())); // default
3592        assert!(destinations.contains(&"/app/data".to_string())); // storage bind
3593        assert!(destinations.contains(&"/app/tmp".to_string())); // storage tmpfs
3594    }
3595
3596    fn mock_gpu_spec(vendor: &str, count: u32) -> ServiceSpec {
3597        let yaml = format!(
3598            "
3599version: v1
3600deployment: test
3601services:
3602  test:
3603    rtype: service
3604    image:
3605      name: test:latest
3606    resources:
3607      gpu:
3608        count: {count}
3609        vendor: {vendor}
3610    endpoints:
3611      - name: http
3612        protocol: http
3613        port: 8080
3614"
3615        );
3616        serde_yaml::from_str::<DeploymentSpec>(&yaml)
3617            .unwrap()
3618            .services
3619            .remove("test")
3620            .unwrap()
3621    }
3622
3623    fn write_nvidia_cdi_fixture(dir: &std::path::Path, json: &str) {
3624        std::fs::write(dir.join("nvidia.json"), json).unwrap();
3625    }
3626
3627    fn nvidia_cdi_fixture() -> &'static str {
3628        r#"{
3629            "cdiVersion": "0.6.0",
3630            "kind": "nvidia.com/gpu",
3631            "devices": [{
3632                "name": "0",
3633                "containerEdits": {
3634                    "deviceNodes": [
3635                        {"path": "/dev/nvidia0", "type": "c", "major": 195, "minor": 0}
3636                    ],
3637                    "env": ["NVIDIA_VISIBLE_DEVICES=0"],
3638                    "hooks": {
3639                        "createContainer": [{
3640                            "path": "/usr/bin/nvidia-container-runtime-hook",
3641                            "args": ["nvidia-container-runtime-hook", "prestart"]
3642                        }]
3643                    }
3644                }
3645            }]
3646        }"#
3647    }
3648
3649    #[cfg(target_os = "linux")]
3650    #[tokio::test]
3651    async fn gpu_spec_translates_to_cdi_device_nodes() {
3652        let dir = tempfile::tempdir().unwrap();
3653        write_nvidia_cdi_fixture(dir.path(), nvidia_cdi_fixture());
3654        let registry = std::sync::Arc::new(crate::cdi::CdiRegistry::discover_from(&[dir.path()]));
3655
3656        let id = ContainerId::new("test".to_string(), 1);
3657        let spec = mock_gpu_spec("nvidia", 1);
3658        let builder = BundleBuilder::new("/tmp/test-bundle-cdi".into()).with_cdi_registry(registry);
3659
3660        let oci_spec = builder
3661            .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3662            .await
3663            .expect("build with CDI fixture");
3664
3665        // CDI device node merged into linux.devices
3666        let linux = oci_spec.linux().as_ref().expect("linux config present");
3667        let devices = linux.devices().as_ref().expect("devices present");
3668        assert!(
3669            devices
3670                .iter()
3671                .any(|d| d.path() == std::path::Path::new("/dev/nvidia0")),
3672            "expected /dev/nvidia0 from CDI fixture; got {:?}",
3673            devices
3674                .iter()
3675                .map(oci_spec::runtime::LinuxDevice::path)
3676                .collect::<Vec<_>>()
3677        );
3678
3679        // CDI env var merged into process.env
3680        let process = oci_spec.process().as_ref().expect("process present");
3681        let env = process.env().as_ref().expect("env present");
3682        assert!(
3683            env.iter().any(|e| e == "NVIDIA_VISIBLE_DEVICES=0"),
3684            "expected NVIDIA_VISIBLE_DEVICES=0 in env; got {env:?}"
3685        );
3686
3687        // CDI hook merged into hooks.createContainer
3688        let hooks = oci_spec.hooks().as_ref().expect("hooks present");
3689        let create_container = hooks
3690            .create_container()
3691            .as_ref()
3692            .expect("createContainer hooks present");
3693        assert_eq!(create_container.len(), 1);
3694        assert_eq!(
3695            create_container[0].path(),
3696            &std::path::PathBuf::from("/usr/bin/nvidia-container-runtime-hook")
3697        );
3698    }
3699
3700    #[tokio::test]
3701    async fn gpu_spec_with_missing_cdi_returns_error() {
3702        // Empty tempdir — no CDI specs installed at all.
3703        let dir = tempfile::tempdir().unwrap();
3704        let registry = std::sync::Arc::new(crate::cdi::CdiRegistry::discover_from(&[dir.path()]));
3705
3706        let id = ContainerId::new("test".to_string(), 1);
3707        let spec = mock_gpu_spec("nvidia", 1);
3708        let builder =
3709            BundleBuilder::new("/tmp/test-bundle-cdi-missing".into()).with_cdi_registry(registry);
3710
3711        let err = builder
3712            .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3713            .await
3714            .expect_err("should fail when CDI registry is empty");
3715
3716        match err {
3717            AgentError::InvalidSpec(msg) => {
3718                assert!(
3719                    msg.contains("nvidia") || msg.contains("CDI"),
3720                    "error should mention CDI / vendor; got: {msg}"
3721                );
3722            }
3723            other => panic!("expected InvalidSpec, got {other:?}"),
3724        }
3725    }
3726
3727    #[tokio::test]
3728    async fn gpu_spec_with_unknown_device_returns_error() {
3729        // Spec has device "0" but the request will ask for two GPUs (so the
3730        // resolver will look for "1" and fail).
3731        let dir = tempfile::tempdir().unwrap();
3732        write_nvidia_cdi_fixture(dir.path(), nvidia_cdi_fixture());
3733        let registry = std::sync::Arc::new(crate::cdi::CdiRegistry::discover_from(&[dir.path()]));
3734
3735        let id = ContainerId::new("test".to_string(), 1);
3736        let spec = mock_gpu_spec("nvidia", 2);
3737        let builder =
3738            BundleBuilder::new("/tmp/test-bundle-cdi-unknown".into()).with_cdi_registry(registry);
3739
3740        let err = builder
3741            .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3742            .await
3743            .expect_err("should fail when device '1' is not declared");
3744        match err {
3745            AgentError::InvalidSpec(msg) => {
3746                assert!(
3747                    msg.contains("'1'") || msg.contains("device"),
3748                    "error should mention the missing device; got: {msg}"
3749                );
3750            }
3751            other => panic!("expected InvalidSpec, got {other:?}"),
3752        }
3753    }
3754
3755    #[cfg(target_os = "linux")]
3756    #[tokio::test]
3757    async fn gpu_spec_with_all_devices_expands_to_all_in_spec() {
3758        // Fixture with two declared devices ("0" and "1").
3759        let dir = tempfile::tempdir().unwrap();
3760        let fixture = r#"{
3761            "cdiVersion": "0.6.0",
3762            "kind": "nvidia.com/gpu",
3763            "devices": [
3764                {
3765                    "name": "0",
3766                    "containerEdits": {
3767                        "env": ["NVIDIA_VISIBLE_DEVICES=0"],
3768                        "deviceNodes": [
3769                            {"path": "/dev/nvidia0", "type": "c", "major": 195, "minor": 0}
3770                        ]
3771                    }
3772                },
3773                {
3774                    "name": "1",
3775                    "containerEdits": {
3776                        "env": ["NVIDIA_VISIBLE_DEVICES=1"],
3777                        "deviceNodes": [
3778                            {"path": "/dev/nvidia1", "type": "c", "major": 195, "minor": 1}
3779                        ]
3780                    }
3781                }
3782            ]
3783        }"#;
3784        write_nvidia_cdi_fixture(dir.path(), fixture);
3785        let registry = std::sync::Arc::new(crate::cdi::CdiRegistry::discover_from(&[dir.path()]));
3786
3787        // Resolve "all" via the registry directly to validate expansion
3788        // semantics independently of how we map count -> names.
3789        let edits = registry
3790            .resolve_for_kind("nvidia.com/gpu", &["all".to_string()])
3791            .expect("resolve all");
3792        assert_eq!(edits.len(), 2);
3793
3794        // Now build the bundle for a 2-GPU service and confirm both nodes
3795        // land in linux.devices.
3796        let id = ContainerId::new("test".to_string(), 1);
3797        let spec = mock_gpu_spec("nvidia", 2);
3798        let builder =
3799            BundleBuilder::new("/tmp/test-bundle-cdi-all".into()).with_cdi_registry(registry);
3800
3801        let oci_spec = builder
3802            .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3803            .await
3804            .expect("build with 2-device fixture");
3805
3806        let devices = oci_spec
3807            .linux()
3808            .as_ref()
3809            .unwrap()
3810            .devices()
3811            .as_ref()
3812            .expect("devices present");
3813        let paths: Vec<_> = devices.iter().map(|d| d.path().clone()).collect();
3814        assert!(paths.contains(&std::path::PathBuf::from("/dev/nvidia0")));
3815        assert!(paths.contains(&std::path::PathBuf::from("/dev/nvidia1")));
3816    }
3817
3818    /// Build the standard fixture-backed CDI registry used by the MPS /
3819    /// time-slicing tests. Identical to the helper used by the 5.A CDI
3820    /// tests above but expressed as a closure-style helper to keep each test
3821    /// self-contained.
3822    fn build_nvidia_cdi_registry(dir: &std::path::Path) -> std::sync::Arc<crate::cdi::CdiRegistry> {
3823        write_nvidia_cdi_fixture(dir, nvidia_cdi_fixture());
3824        std::sync::Arc::new(crate::cdi::CdiRegistry::discover_from(&[dir]))
3825    }
3826
3827    #[cfg(target_os = "linux")]
3828    #[tokio::test]
3829    async fn gpu_spec_with_mps_sharing_injects_env_and_mounts() {
3830        // Stage host-side MPS directories in a tempdir so the resolver's
3831        // `is_dir()` check passes without touching /tmp/nvidia-mps on the
3832        // real host.
3833        let cdi_dir = tempfile::tempdir().unwrap();
3834        let mps_root = tempfile::tempdir().unwrap();
3835        let pipe_dir = mps_root.path().join("nvidia-mps");
3836        let log_dir = mps_root.path().join("nvidia-log");
3837        std::fs::create_dir(&pipe_dir).unwrap();
3838        std::fs::create_dir(&log_dir).unwrap();
3839        let registry = build_nvidia_cdi_registry(cdi_dir.path());
3840
3841        let id = ContainerId::new("test".to_string(), 1);
3842        let mut spec = mock_gpu_spec("nvidia", 1);
3843        let gpu = spec.resources.gpu.as_mut().expect("gpu spec set");
3844        gpu.sharing = Some(zlayer_spec::GpuSharingMode::Mps);
3845        gpu.mps_pipe_dir = Some(pipe_dir.to_string_lossy().into_owned());
3846        gpu.mps_log_dir = Some(log_dir.to_string_lossy().into_owned());
3847
3848        let builder =
3849            BundleBuilder::new("/tmp/test-bundle-mps-env".into()).with_cdi_registry(registry);
3850        let oci_spec = builder
3851            .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3852            .await
3853            .expect("build with MPS sharing");
3854
3855        let env = oci_spec
3856            .process()
3857            .as_ref()
3858            .and_then(|p| p.env().as_ref())
3859            .expect("env present");
3860        let pipe_expect = format!("CUDA_MPS_PIPE_DIRECTORY={}", pipe_dir.display());
3861        let log_expect = format!("CUDA_MPS_LOG_DIRECTORY={}", log_dir.display());
3862        assert!(
3863            env.iter().any(|e| e == &pipe_expect),
3864            "expected {pipe_expect} in env; got {env:?}"
3865        );
3866        assert!(
3867            env.iter().any(|e| e == &log_expect),
3868            "expected {log_expect} in env; got {env:?}"
3869        );
3870
3871        let mounts = oci_spec.mounts().as_ref().expect("mounts present");
3872        assert!(
3873            mounts
3874                .iter()
3875                .any(|m| m.destination() == &pipe_dir && m.source().as_ref() == Some(&pipe_dir)),
3876            "expected bind mount of MPS pipe dir {}; got destinations {:?}",
3877            pipe_dir.display(),
3878            mounts.iter().map(Mount::destination).collect::<Vec<_>>()
3879        );
3880        assert!(
3881            mounts
3882                .iter()
3883                .any(|m| m.destination() == &log_dir && m.source().as_ref() == Some(&log_dir)),
3884            "expected bind mount of MPS log dir {}",
3885            log_dir.display()
3886        );
3887    }
3888
3889    #[tokio::test]
3890    async fn gpu_spec_with_mps_sharing_fails_when_pipe_dir_missing() {
3891        let cdi_dir = tempfile::tempdir().unwrap();
3892        let registry = build_nvidia_cdi_registry(cdi_dir.path());
3893
3894        let id = ContainerId::new("test".to_string(), 1);
3895        let mut spec = mock_gpu_spec("nvidia", 1);
3896        let gpu = spec.resources.gpu.as_mut().expect("gpu spec set");
3897        gpu.sharing = Some(zlayer_spec::GpuSharingMode::Mps);
3898        // Path that demonstrably does not exist — tempdir() returns a unique
3899        // path so appending "definitely-not-here" gives a guaranteed miss.
3900        let missing = tempfile::tempdir().unwrap();
3901        let missing_path = missing.path().join("definitely-not-here");
3902        gpu.mps_pipe_dir = Some(missing_path.to_string_lossy().into_owned());
3903
3904        let builder =
3905            BundleBuilder::new("/tmp/test-bundle-mps-missing".into()).with_cdi_registry(registry);
3906        let err = builder
3907            .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3908            .await
3909            .expect_err("should fail when MPS pipe dir is missing");
3910        match err {
3911            AgentError::GpuSharingUnavailable { mode, reason } => {
3912                assert_eq!(mode, "mps");
3913                assert!(
3914                    reason.contains("pipe") || reason.contains(&missing_path.display().to_string()),
3915                    "reason should mention the missing path; got: {reason}"
3916                );
3917            }
3918            other => panic!("expected GpuSharingUnavailable, got {other:?}"),
3919        }
3920    }
3921
3922    #[cfg(target_os = "linux")]
3923    #[tokio::test]
3924    async fn gpu_spec_with_timeslicing_injects_visible_devices() {
3925        let cdi_dir = tempfile::tempdir().unwrap();
3926        let registry = build_nvidia_cdi_registry(cdi_dir.path());
3927
3928        let id = ContainerId::new("test".to_string(), 1);
3929        let mut spec = mock_gpu_spec("nvidia", 1);
3930        let gpu = spec.resources.gpu.as_mut().expect("gpu spec set");
3931        gpu.sharing = Some(zlayer_spec::GpuSharingMode::TimeSlice);
3932        gpu.time_slice_index = Some(2);
3933
3934        let builder =
3935            BundleBuilder::new("/tmp/test-bundle-timeslice".into()).with_cdi_registry(registry);
3936        let oci_spec = builder
3937            .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3938            .await
3939            .expect("build with time-slicing");
3940
3941        let env = oci_spec
3942            .process()
3943            .as_ref()
3944            .and_then(|p| p.env().as_ref())
3945            .expect("env present");
3946        // Time-slicing must clobber any earlier `CUDA_VISIBLE_DEVICES` (e.g.
3947        // the CDI-emitted full-device list) to advertise exactly the slice.
3948        let cuda_entries: Vec<&String> = env
3949            .iter()
3950            .filter(|e| e.starts_with("CUDA_VISIBLE_DEVICES="))
3951            .collect();
3952        assert_eq!(
3953            cuda_entries.len(),
3954            1,
3955            "exactly one CUDA_VISIBLE_DEVICES expected; got {cuda_entries:?}"
3956        );
3957        assert_eq!(cuda_entries[0], "CUDA_VISIBLE_DEVICES=2");
3958    }
3959
3960    #[cfg(target_os = "linux")]
3961    #[tokio::test]
3962    async fn gpu_spec_without_sharing_omits_mps_env() {
3963        let cdi_dir = tempfile::tempdir().unwrap();
3964        let registry = build_nvidia_cdi_registry(cdi_dir.path());
3965
3966        let id = ContainerId::new("test".to_string(), 1);
3967        let spec = mock_gpu_spec("nvidia", 1);
3968        assert!(spec.resources.gpu.as_ref().unwrap().sharing.is_none());
3969
3970        let builder =
3971            BundleBuilder::new("/tmp/test-bundle-no-sharing".into()).with_cdi_registry(registry);
3972        let oci_spec = builder
3973            .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3974            .await
3975            .expect("build without sharing");
3976
3977        let env = oci_spec
3978            .process()
3979            .as_ref()
3980            .and_then(|p| p.env().as_ref())
3981            .expect("env present");
3982        assert!(
3983            !env.iter().any(|e| e.starts_with("CUDA_MPS_")),
3984            "no CUDA_MPS_* env should be present without sharing; got {env:?}"
3985        );
3986
3987        // No MPS mount should be added either. The 5.A CDI fixture mounts a
3988        // /dev/nvidia0 device but never bind-mounts /tmp/nvidia-mps; verify
3989        // we don't sneak that in.
3990        let mounts = oci_spec.mounts().as_ref().expect("mounts present");
3991        assert!(
3992            !mounts
3993                .iter()
3994                .any(|m| { m.destination().to_string_lossy().contains("nvidia-mps") }),
3995            "no MPS pipe mount should be present without sharing"
3996        );
3997    }
3998
3999    #[cfg(unix)]
4000    mod subid_tests {
4001        use super::super::read_subid_range;
4002        use std::io::Write;
4003
4004        #[test]
4005        fn read_subid_range_returns_range_for_user() {
4006            let mut tmp = tempfile::NamedTempFile::new().unwrap();
4007            writeln!(tmp, "alice:100000:65536").unwrap();
4008            writeln!(tmp, "bob:165536:65536").unwrap();
4009            tmp.flush().unwrap();
4010            let path = tmp.path().to_str().unwrap();
4011            assert_eq!(read_subid_range(path, "bob"), Some((165_536, 65_536)));
4012            assert_eq!(read_subid_range(path, "alice"), Some((100_000, 65_536)));
4013        }
4014
4015        #[test]
4016        fn read_subid_range_returns_none_for_unknown_user() {
4017            let mut tmp = tempfile::NamedTempFile::new().unwrap();
4018            writeln!(tmp, "alice:100000:65536").unwrap();
4019            tmp.flush().unwrap();
4020            assert_eq!(
4021                read_subid_range(tmp.path().to_str().unwrap(), "carol"),
4022                None
4023            );
4024        }
4025
4026        #[test]
4027        fn read_subid_range_returns_none_on_missing_file() {
4028            assert_eq!(
4029                read_subid_range("/this/path/does/not/exist/subuid", "anyone"),
4030                None
4031            );
4032        }
4033    }
4034}