1use crate::cdi::{self, CdiContainerEdits, CdiRegistry};
9use crate::error::{AgentError, Result};
10use crate::runtime::ContainerId;
11use oci_spec::runtime::{
12 Capability, Hook, HookBuilder, Hooks, HooksBuilder, LinuxBuilder, LinuxCapabilitiesBuilder,
13 LinuxCpuBuilder, LinuxDeviceBuilder, LinuxDeviceCgroupBuilder, LinuxDeviceType,
14 LinuxMemoryBuilder, LinuxNamespaceBuilder, LinuxNamespaceType, LinuxResourcesBuilder, Mount,
15 MountBuilder, ProcessBuilder, RootBuilder, Spec, SpecBuilder, UserBuilder,
16};
17#[cfg(unix)]
20use oci_spec::runtime::LinuxIdMappingBuilder;
21use std::collections::{HashMap, HashSet};
22use std::path::{Path, PathBuf};
31use std::str::FromStr;
32use std::sync::Arc;
33use tokio::fs;
34use zlayer_secrets::SecretsProvider;
35use zlayer_spec::{GpuSharingMode, ServiceSpec, StorageSpec, StorageTier};
36
/// Default host directory for the NVIDIA MPS pipe files, used by
/// `resolve_mps_dirs` when the GPU spec does not set `mps_pipe_dir`.
const DEFAULT_MPS_PIPE_DIR: &str = "/tmp/nvidia-mps";

/// Default host directory for NVIDIA MPS logs, used by `resolve_mps_dirs`
/// when the GPU spec does not set `mps_log_dir`.
const DEFAULT_MPS_LOG_DIR: &str = "/tmp/nvidia-log";

/// Destination inside the container of the read-only bind mount that exposes
/// the host's GPU time-slicing config file.
const TIMESLICE_CONFIG_CONTAINER_PATH: &str = "/etc/nvidia/gpu-time-slicing.yaml";
50
/// Host directories resolved for a container that uses MPS GPU sharing.
///
/// Produced by `resolve_mps_dirs`; both paths have been checked to be
/// existing directories at resolution time.
struct MpsDirs {
    /// Directory exported to the container as `CUDA_MPS_PIPE_DIRECTORY`.
    pipe_dir: PathBuf,
    /// Directory exported to the container as `CUDA_MPS_LOG_DIRECTORY`.
    log_dir: PathBuf,
}
60
61fn resolve_mps_dirs(gpu: &zlayer_spec::GpuSpec) -> Result<Option<MpsDirs>> {
71 if gpu.sharing != Some(GpuSharingMode::Mps) {
72 return Ok(None);
73 }
74
75 let pipe_dir = PathBuf::from(gpu.mps_pipe_dir.as_deref().unwrap_or(DEFAULT_MPS_PIPE_DIR));
76 let log_dir = PathBuf::from(gpu.mps_log_dir.as_deref().unwrap_or(DEFAULT_MPS_LOG_DIR));
77
78 if !pipe_dir.is_dir() {
79 return Err(AgentError::GpuSharingUnavailable {
80 mode: "mps".to_string(),
81 reason: format!(
82 "MPS pipe directory {} does not exist; ensure nvidia-cuda-mps-control is running",
83 pipe_dir.display()
84 ),
85 });
86 }
87 if !log_dir.is_dir() {
88 return Err(AgentError::GpuSharingUnavailable {
89 mode: "mps".to_string(),
90 reason: format!(
91 "MPS log directory {} does not exist; ensure nvidia-cuda-mps-control is running",
92 log_dir.display()
93 ),
94 });
95 }
96
97 Ok(Some(MpsDirs { pipe_dir, log_dir }))
98}
99
100fn cdi_node_to_oci_device(
109 node: &crate::cdi::CdiDeviceNode,
110) -> Result<oci_spec::runtime::LinuxDevice> {
111 let host_path = node.host_path.as_deref().unwrap_or(&node.path);
112
113 let dev_type = match node.device_type.as_deref() {
114 Some("c" | "u") => LinuxDeviceType::C,
115 Some("b") => LinuxDeviceType::B,
116 Some("p") => LinuxDeviceType::P,
117 _ => get_device_type(host_path).unwrap_or(LinuxDeviceType::C),
118 };
119
120 let (major, minor) = if let (Some(maj), Some(min)) = (node.major, node.minor) {
121 (maj, min)
122 } else {
123 get_device_major_minor(host_path).unwrap_or((0, 0))
124 };
125
126 let mut builder = LinuxDeviceBuilder::default()
127 .path(node.path.clone())
128 .typ(dev_type)
129 .major(major)
130 .minor(minor);
131 if let Some(mode) = node.file_mode {
132 builder = builder.file_mode(mode);
133 } else {
134 builder = builder.file_mode(0o666u32);
135 }
136 builder = builder.uid(node.uid.unwrap_or(0));
137 builder = builder.gid(node.gid.unwrap_or(0));
138
139 builder.build().map_err(|e| {
140 AgentError::InvalidSpec(format!(
141 "failed to build CDI device {path}: {e}",
142 path = node.path
143 ))
144 })
145}
146
147fn convert_cdi_hook(cdi_hook: &crate::cdi::CdiHook) -> Result<Hook> {
149 let mut builder = HookBuilder::default().path(PathBuf::from(&cdi_hook.path));
150 if !cdi_hook.args.is_empty() {
151 builder = builder.args(cdi_hook.args.clone());
152 }
153 if !cdi_hook.env.is_empty() {
154 builder = builder.env(cdi_hook.env.clone());
155 }
156 builder
157 .build()
158 .map_err(|e| AgentError::InvalidSpec(format!("failed to build CDI hook: {e}")))
159}
160
/// Capabilities granted to privileged containers.
///
/// `build_capabilities` copies this list into the bounding, effective and
/// permitted sets when the service spec is marked `privileged`.
const ALL_CAPABILITIES: &[Capability] = &[
    Capability::AuditControl,
    Capability::AuditRead,
    Capability::AuditWrite,
    Capability::BlockSuspend,
    Capability::Bpf,
    Capability::CheckpointRestore,
    Capability::Chown,
    Capability::DacOverride,
    Capability::DacReadSearch,
    Capability::Fowner,
    Capability::Fsetid,
    Capability::IpcLock,
    Capability::IpcOwner,
    Capability::Kill,
    Capability::Lease,
    Capability::LinuxImmutable,
    Capability::MacAdmin,
    Capability::MacOverride,
    Capability::Mknod,
    Capability::NetAdmin,
    Capability::NetBindService,
    Capability::NetBroadcast,
    Capability::NetRaw,
    Capability::Perfmon,
    Capability::Setfcap,
    Capability::Setgid,
    Capability::Setpcap,
    Capability::Setuid,
    Capability::SysAdmin,
    Capability::SysBoot,
    Capability::SysChroot,
    Capability::SysModule,
    Capability::SysNice,
    Capability::SysPacct,
    Capability::SysPtrace,
    Capability::SysRawio,
    Capability::SysResource,
    Capability::SysTime,
    Capability::SysTtyConfig,
    Capability::Syslog,
    Capability::WakeAlarm,
];
205
/// Parses a human-readable memory quantity into a byte count.
///
/// Surrounding whitespace is ignored. The value is an unsigned integer with
/// an optional unit suffix:
///
/// * binary units `Ki`, `Mi`, `Gi`, `Ti` (powers of 1024);
/// * decimal units `K`, `M`, `G`, `T` in either case (powers of 1000);
/// * no suffix means bytes.
///
/// # Errors
///
/// Returns a descriptive message when the input is empty, the numeric part is
/// not a valid `u64`, or the scaled value does not fit in a `u64`.
pub fn parse_memory_string(s: &str) -> std::result::Result<u64, String> {
    const KIB: u64 = 1024;
    const KB: u64 = 1000;
    // Binary suffixes end in 'i', so they can never be mistaken for the
    // single-letter decimal suffixes that follow them in this table.
    const UNITS: [(&str, u64); 12] = [
        ("Ki", KIB),
        ("Mi", KIB * KIB),
        ("Gi", KIB * KIB * KIB),
        ("Ti", KIB * KIB * KIB * KIB),
        ("K", KB),
        ("k", KB),
        ("M", KB * KB),
        ("m", KB * KB),
        ("G", KB * KB * KB),
        ("g", KB * KB * KB),
        ("T", KB * KB * KB * KB),
        ("t", KB * KB * KB * KB),
    ];

    let s = s.trim();
    if s.is_empty() {
        return Err("empty memory string".to_string());
    }

    let (num_str, multiplier) = UNITS
        .iter()
        .find_map(|&(suffix, factor)| s.strip_suffix(suffix).map(|digits| (digits, factor)))
        .unwrap_or((s, 1));

    let num: u64 = num_str
        .parse()
        .map_err(|e| format!("invalid number: {e}"))?;

    // A plain `*` would panic in debug builds and silently wrap in release.
    num.checked_mul(multiplier)
        .ok_or_else(|| format!("memory value '{s}' overflows u64"))
}
254
/// Reads the major and minor device numbers of the node at `path`.
///
/// The raw `st_rdev` value is decoded with the Linux `dev_t` layout (the same
/// one glibc's `major()`/`minor()` use): the major number is 12 low bits plus
/// 20 high bits, the minor number is 8 low bits plus 24 high bits. Masking
/// both with `0xff` would truncate any major or minor above 255.
///
/// NOTE(review): the decoding is Linux-specific even though the function is
/// compiled for every Unix target; other Unixes encode `dev_t` differently.
///
/// # Errors
///
/// Returns the underlying I/O error when the path cannot be stat'ed.
#[cfg(unix)]
#[allow(clippy::cast_possible_wrap)]
fn get_device_major_minor(path: &str) -> std::io::Result<(i64, i64)> {
    use std::os::unix::fs::MetadataExt;

    let rdev = std::fs::metadata(path)?.rdev();

    // major: bits 8..=19 form the low 12 bits, bits 44..=63 the high 20 bits.
    let major = ((rdev & 0x0000_0000_000f_ff00) >> 8) | ((rdev & 0xffff_f000_0000_0000) >> 32);
    // minor: bits 0..=7 form the low 8 bits, bits 20..=43 the high 24 bits.
    let minor = (rdev & 0x0000_0000_0000_00ff) | ((rdev & 0x0000_0fff_fff0_0000) >> 12);

    // Both values fit in 32 bits, so the casts cannot wrap.
    Ok((major as i64, minor as i64))
}
274
/// Non-Unix stand-in for the device number probe.
///
/// Always fails with [`std::io::ErrorKind::Unsupported`]; the path is ignored.
#[cfg(not(unix))]
fn get_device_major_minor(_path: &str) -> std::io::Result<(i64, i64)> {
    use std::io::{Error, ErrorKind};

    let unsupported = Error::new(ErrorKind::Unsupported, "device-cgroup probes require Unix");
    Err(unsupported)
}
283
284#[cfg(unix)]
289fn get_device_type(path: &str) -> std::io::Result<LinuxDeviceType> {
290 use std::os::unix::fs::FileTypeExt;
291 let metadata = std::fs::metadata(path)?;
292 let file_type = metadata.file_type();
293 if file_type.is_char_device() {
294 Ok(LinuxDeviceType::C)
295 } else if file_type.is_block_device() {
296 Ok(LinuxDeviceType::B)
297 } else {
298 Ok(LinuxDeviceType::U) }
300}
301
/// Non-Unix stand-in for the device type probe.
///
/// Always fails with [`std::io::ErrorKind::Unsupported`]; the path is ignored.
#[cfg(not(unix))]
fn get_device_type(_path: &str) -> std::io::Result<LinuxDeviceType> {
    use std::io::{Error, ErrorKind};

    let unsupported = Error::new(ErrorKind::Unsupported, "device-cgroup probes require Unix");
    Err(unsupported)
}
310
/// Builder for an OCI bundle (bundle directory, `rootfs` entry and
/// `config.json`) describing one container.
///
/// Configure it with the `with_*` methods, then call `build` to write the
/// bundle to disk or `build_spec_only` to obtain just the OCI runtime spec.
#[derive(Clone)]
pub struct BundleBuilder {
    /// Directory the bundle is written to.
    bundle_dir: PathBuf,
    /// Existing root filesystem to symlink as `rootfs` inside the bundle;
    /// when unset an empty `rootfs` directory is created instead.
    rootfs_path: Option<PathBuf>,
    /// Hostname override; the container id is used when unset.
    hostname: Option<String>,
    /// Extra environment variables, applied after the image and service-spec
    /// environment and replacing earlier entries with the same name.
    extra_env: Vec<(String, String)>,
    /// Working directory override; takes precedence over the service spec
    /// and the image config.
    cwd: Option<String>,
    /// Process arguments override; when unset the command is resolved from
    /// the service spec and image config.
    args: Option<Vec<String>>,
    /// Volume name to host path mapping handed to storage mount generation.
    volume_paths: HashMap<String, PathBuf>,
    /// Image configuration supplying the default user, environment and
    /// working directory.
    image_config: Option<zlayer_registry::ImageConfig>,
    /// Host networking flag.
    /// NOTE(review): not read in the part of the file reviewed here;
    /// presumably consumed when the Linux namespaces are built — confirm.
    host_network: bool,
    /// Secrets provider used, together with `deployment_scope`, to resolve
    /// the service spec's environment.
    secrets_provider: Option<Arc<dyn SecretsProvider>>,
    /// Scope passed to the secrets provider during environment resolution.
    deployment_scope: Option<String>,
    /// Host path of a socket to bind-mount read-only into the container.
    socket_path: Option<String>,
    /// Explicit CDI registry; when set, CDI resolution failures are errors
    /// instead of falling back to the built-in GPU passthrough.
    cdi_registry: Option<Arc<CdiRegistry>>,
}
358
359impl std::fmt::Debug for BundleBuilder {
360 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
361 f.debug_struct("BundleBuilder")
362 .field("bundle_dir", &self.bundle_dir)
363 .field("rootfs_path", &self.rootfs_path)
364 .field("hostname", &self.hostname)
365 .field("extra_env", &self.extra_env)
366 .field("cwd", &self.cwd)
367 .field("args", &self.args)
368 .field("volume_paths", &self.volume_paths)
369 .field("image_config", &self.image_config)
370 .field("host_network", &self.host_network)
371 .field("secrets_provider", &self.secrets_provider.is_some())
372 .field("deployment_scope", &self.deployment_scope)
373 .field("socket_path", &self.socket_path)
374 .field("cdi_registry", &self.cdi_registry.is_some())
375 .finish()
376 }
377}
378
379#[cfg(unix)]
387fn build_rootless_id_mappings(
388 host_id: u32,
389 subid_path: &str,
390 username: &str,
391) -> Vec<oci_spec::runtime::LinuxIdMapping> {
392 let mut mappings = vec![LinuxIdMappingBuilder::default()
393 .container_id(0_u32)
394 .host_id(host_id)
395 .size(1_u32)
396 .build()
397 .unwrap()];
398 if !username.is_empty() {
399 if let Some((start, count)) = read_subid_range(subid_path, username) {
400 mappings.push(
401 LinuxIdMappingBuilder::default()
402 .container_id(1_u32)
403 .host_id(start)
404 .size(count)
405 .build()
406 .unwrap(),
407 );
408 }
409 }
410 mappings
411}
412
/// Looks up the subordinate id range for `username` in a
/// `/etc/subuid`-style file (`name:start:count` per line).
///
/// Returns the first well-formed entry for the user as `(start, count)`.
/// Lines for other users, comment lines and malformed entries are skipped, so
/// one bad line no longer hides a valid entry that follows it. Surrounding
/// whitespace on a line or field is ignored.
///
/// Returns `None` when the file cannot be read or holds no usable entry.
#[cfg(unix)]
fn read_subid_range(path: &str, username: &str) -> Option<(u32, u32)> {
    let contents = std::fs::read_to_string(path).ok()?;

    contents.lines().find_map(|line| {
        let mut fields = line.trim().splitn(3, ':');
        if fields.next()? != username {
            return None;
        }
        // Inside `find_map`, `?` only rejects this line; the search continues.
        let start = fields.next()?.trim().parse::<u32>().ok()?;
        let count = fields.next()?.trim().parse::<u32>().ok()?;
        Some((start, count))
    })
}
435
436impl BundleBuilder {
437 #[must_use]
447 pub fn new(bundle_dir: PathBuf) -> Self {
448 Self {
449 bundle_dir,
450 rootfs_path: None,
451 hostname: None,
452 extra_env: Vec::new(),
453 cwd: None,
454 args: None,
455 volume_paths: HashMap::new(),
456 image_config: None,
457 host_network: false,
458 secrets_provider: None,
459 deployment_scope: None,
460 socket_path: None,
461 cdi_registry: None,
462 }
463 }
464
465 #[must_use]
472 pub fn with_cdi_registry(mut self, registry: Arc<CdiRegistry>) -> Self {
473 self.cdi_registry = Some(registry);
474 self
475 }
476
477 #[must_use]
479 pub fn for_container(container_id: &ContainerId) -> Self {
480 let bundle_dir = zlayer_paths::ZLayerDirs::system_default()
481 .bundles()
482 .join(container_id.to_string());
483 Self::new(bundle_dir)
484 }
485
486 #[must_use]
490 pub fn with_rootfs(mut self, rootfs_path: PathBuf) -> Self {
491 self.rootfs_path = Some(rootfs_path);
492 self
493 }
494
495 #[must_use]
497 pub fn with_hostname(mut self, hostname: String) -> Self {
498 self.hostname = Some(hostname);
499 self
500 }
501
502 #[must_use]
504 pub fn with_env(mut self, key: String, value: String) -> Self {
505 self.extra_env.push((key, value));
506 self
507 }
508
509 #[must_use]
511 pub fn with_cwd(mut self, cwd: String) -> Self {
512 self.cwd = Some(cwd);
513 self
514 }
515
516 #[must_use]
518 pub fn with_args(mut self, args: Vec<String>) -> Self {
519 self.args = Some(args);
520 self
521 }
522
523 #[must_use]
528 pub fn with_volume_paths(mut self, volume_paths: HashMap<String, PathBuf>) -> Self {
529 self.volume_paths = volume_paths;
530 self
531 }
532
533 #[must_use]
538 pub fn with_image_config(mut self, config: zlayer_registry::ImageConfig) -> Self {
539 self.image_config = Some(config);
540 self
541 }
542
543 #[must_use]
549 pub fn with_host_network(mut self, host_network: bool) -> Self {
550 self.host_network = host_network;
551 self
552 }
553
554 #[must_use]
559 pub fn with_secrets_provider(mut self, provider: Arc<dyn SecretsProvider>) -> Self {
560 self.secrets_provider = Some(provider);
561 self
562 }
563
564 #[must_use]
569 pub fn with_deployment_scope(mut self, scope: String) -> Self {
570 self.deployment_scope = Some(scope);
571 self
572 }
573
574 #[must_use]
577 pub fn with_socket_mount(mut self, path: impl Into<String>) -> Self {
578 self.socket_path = Some(path.into());
579 self
580 }
581
582 #[must_use]
584 pub fn bundle_dir(&self) -> &Path {
585 &self.bundle_dir
586 }
587
588 #[cfg(unix)]
607 pub async fn build(&self, container_id: &ContainerId, spec: &ServiceSpec) -> Result<PathBuf> {
608 fs::create_dir_all(&self.bundle_dir)
610 .await
611 .map_err(|e| AgentError::CreateFailed {
612 id: container_id.to_string(),
613 reason: format!("failed to create bundle directory: {e}"),
614 })?;
615
616 let rootfs_in_bundle = self.bundle_dir.join("rootfs");
618 if let Some(ref rootfs_path) = self.rootfs_path {
619 let _ = fs::remove_file(&rootfs_in_bundle).await;
621 let _ = fs::remove_dir(&rootfs_in_bundle).await;
622
623 #[cfg(unix)]
628 tokio::fs::symlink(rootfs_path, &rootfs_in_bundle)
629 .await
630 .map_err(|e| AgentError::CreateFailed {
631 id: container_id.to_string(),
632 reason: format!(
633 "failed to symlink rootfs from {} to {}: {}",
634 rootfs_path.display(),
635 rootfs_in_bundle.display(),
636 e
637 ),
638 })?;
639
640 #[cfg(windows)]
641 tokio::fs::symlink_dir(rootfs_path, &rootfs_in_bundle)
642 .await
643 .map_err(|e| AgentError::CreateFailed {
644 id: container_id.to_string(),
645 reason: format!(
646 "failed to symlink rootfs from {} to {}: {}",
647 rootfs_path.display(),
648 rootfs_in_bundle.display(),
649 e
650 ),
651 })?;
652 } else {
653 fs::create_dir_all(&rootfs_in_bundle)
655 .await
656 .map_err(|e| AgentError::CreateFailed {
657 id: container_id.to_string(),
658 reason: format!("failed to create rootfs directory: {e}"),
659 })?;
660 }
661
662 let oci_spec = self
664 .build_spec_only(container_id, spec, &self.volume_paths)
665 .await?;
666
667 let config_path = self.bundle_dir.join("config.json");
669 let config_json =
670 serde_json::to_string_pretty(&oci_spec).map_err(|e| AgentError::CreateFailed {
671 id: container_id.to_string(),
672 reason: format!("failed to serialize OCI spec: {e}"),
673 })?;
674
675 fs::write(&config_path, config_json)
676 .await
677 .map_err(|e| AgentError::CreateFailed {
678 id: container_id.to_string(),
679 reason: format!("failed to write config.json: {e}"),
680 })?;
681
682 tracing::debug!(
683 "Created OCI bundle at {} for container {}",
684 self.bundle_dir.display(),
685 container_id
686 );
687
688 Ok(self.bundle_dir.clone())
689 }
690
691 pub async fn build_spec_only(
711 &self,
712 container_id: &ContainerId,
713 spec: &ServiceSpec,
714 volume_paths: &std::collections::HashMap<String, PathBuf>,
715 ) -> Result<oci_spec::runtime::Spec> {
716 self.build_oci_spec(container_id, spec, volume_paths).await
717 }
718
719 fn resolve_cdi_edits(&self, spec: &ServiceSpec) -> Result<Option<Vec<CdiContainerEdits>>> {
734 let Some(ref gpu) = spec.resources.gpu else {
735 return Ok(None);
736 };
737
738 let Some(kind) = cdi::vendor_to_cdi_kind(&gpu.vendor) else {
741 return Ok(None);
742 };
743
744 let (registry, strict) = if let Some(reg) = &self.cdi_registry {
750 (reg.clone(), true)
751 } else {
752 let reg = Arc::new(CdiRegistry::discover());
753 if reg.is_empty() {
754 return Ok(None);
755 }
756 (reg, false)
757 };
758
759 let device_names: Vec<String> = (0..gpu.count).map(|i| i.to_string()).collect();
760
761 match registry.resolve_for_kind(kind, &device_names) {
762 Ok(edits) => Ok(Some(edits)),
763 Err(err) => {
764 if strict {
765 Err(AgentError::InvalidSpec(format!(
766 "CDI resolution failed for vendor '{}': {err}",
767 gpu.vendor
768 )))
769 } else {
770 tracing::warn!(
771 vendor = %gpu.vendor,
772 kind = %kind,
773 error = %err,
774 "CDI resolution failed; falling back to baked-in GPU device passthrough"
775 );
776 Ok(None)
777 }
778 }
779 }
780 }
781
    /// Assembles the complete OCI runtime [`Spec`] for one container.
    ///
    /// The environment is built in layers, later layers replacing earlier
    /// entries with the same name unless noted otherwise:
    ///
    /// 1. image config environment;
    /// 2. default `PATH` and `TERM` when the image does not set them;
    /// 3. service spec environment (resolved through the secrets provider
    ///    when both a provider and a deployment scope are configured);
    /// 4. builder-level extra environment;
    /// 5. CDI-provided environment, or vendor-specific GPU visibility
    ///    variables when no CDI edits were resolved;
    /// 6. MPS directories, the time-slice `CUDA_VISIBLE_DEVICES` override and
    ///    the distributed-training variables.
    ///
    /// Mounts are the default pseudo-filesystem mounts followed by storage
    /// mounts, the optional socket mount, CDI mounts, MPS directory mounts
    /// and the time-slicing config mount.
    ///
    /// # Errors
    ///
    /// Returns [`AgentError::InvalidSpec`] when any part of the spec fails to
    /// build or environment resolution fails, and
    /// [`AgentError::GpuSharingUnavailable`] when a required GPU-sharing host
    /// path is missing.
    ///
    /// # Panics
    ///
    /// Panics if the socket bind mount fails to build.
    #[allow(clippy::too_many_lines)]
    async fn build_oci_spec(
        &self,
        container_id: &ContainerId,
        spec: &ServiceSpec,
        volume_paths: &std::collections::HashMap<String, PathBuf>,
    ) -> Result<Spec> {
        // Resolved once up front; reused for env, mounts, devices and hooks.
        let cdi_edits = self.resolve_cdi_edits(spec)?;

        let user = {
            // The image config user is expected as "uid" or "uid:gid".
            // NOTE(review): a non-numeric value (e.g. a user name) silently
            // falls back to 0, i.e. root — confirm this is intended.
            let (uid, gid) = if let Some(user_str) = self
                .image_config
                .as_ref()
                .and_then(|c| c.user.as_ref())
                .filter(|u| !u.is_empty())
            {
                let parts: Vec<&str> = user_str.splitn(2, ':').collect();
                let uid = parts[0].parse::<u32>().unwrap_or(0);
                // Without an explicit gid the uid is reused as the gid.
                let gid = if parts.len() > 1 {
                    parts[1].parse::<u32>().unwrap_or(0)
                } else {
                    uid
                };
                (uid, gid)
            } else {
                (0u32, 0u32)
            };

            UserBuilder::default()
                .uid(uid)
                .gid(gid)
                .build()
                .map_err(|e| AgentError::InvalidSpec(format!("failed to build user: {e}")))?
        };

        // `env` holds the final "KEY=value" entries; `env_keys` tracks which
        // names are present so later layers can replace earlier ones.
        let mut env: Vec<String> = Vec::new();
        let mut env_keys: HashSet<String> = HashSet::new();

        // Layer 1: environment baked into the image.
        if let Some(img_env) = self.image_config.as_ref().and_then(|c| c.env.as_ref()) {
            for entry in img_env {
                if let Some(key) = entry.split('=').next() {
                    env_keys.insert(key.to_string());
                }
                env.push(entry.clone());
            }
        }

        // Layer 2: defaults, added only when the image did not provide them.
        if !env_keys.contains("PATH") {
            env.push(
                "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin".to_string(),
            );
            env_keys.insert("PATH".to_string());
        }

        if !env_keys.contains("TERM") {
            env.push("TERM=xterm".to_string());
            env_keys.insert("TERM".to_string());
        }

        // Layer 3: service spec environment. Secrets are resolved only when
        // both a provider and a deployment scope are configured.
        if let (Some(secrets_provider), Some(scope)) =
            (&self.secrets_provider, &self.deployment_scope)
        {
            let resolved_map =
                crate::env::resolve_env_with_secrets(&spec.env, secrets_provider.as_ref(), scope)
                    .await
                    .map_err(|e| {
                        AgentError::InvalidSpec(format!(
                            "environment variable resolution failed: {e}"
                        ))
                    })?;

            for (key, value) in &resolved_map {
                if env_keys.contains(key.as_str()) {
                    env.retain(|e| e.split('=').next() != Some(key.as_str()));
                }
                env_keys.insert(key.clone());
                env.push(format!("{key}={value}"));
            }
        } else {
            let resolved = crate::env::resolve_env_vars_with_warnings(&spec.env).map_err(|e| {
                AgentError::InvalidSpec(format!("environment variable resolution failed: {e}"))
            })?;

            for warning in &resolved.warnings {
                tracing::warn!(container = %container_id, "{}", warning);
            }

            for var in &resolved.vars {
                if let Some(key) = var.split('=').next() {
                    if env_keys.contains(key) {
                        env.retain(|e| e.split('=').next() != Some(key));
                    }
                    env_keys.insert(key.to_string());
                }
                env.push(var.clone());
            }
        }

        // Layer 4: builder-level extra environment.
        for (key, value) in &self.extra_env {
            if env_keys.contains(key.as_str()) {
                env.retain(|e| e.split('=').next() != Some(key.as_str()));
            }
            env_keys.insert(key.clone());
            env.push(format!("{key}={value}"));
        }

        // Layer 5: GPU environment. CDI edits win when available; otherwise
        // vendor-specific visibility variables list device indices 0..count.
        if let Some(ref edits_per_device) = cdi_edits {
            for edits in edits_per_device {
                for entry in &edits.env {
                    if let Some(key) = entry.split('=').next() {
                        if env_keys.contains(key) {
                            env.retain(|e| e.split('=').next() != Some(key));
                        }
                        env_keys.insert(key.to_string());
                    }
                    env.push(entry.clone());
                }
            }
        } else if let Some(ref gpu) = spec.resources.gpu {
            let indices: Vec<String> = (0..gpu.count).map(|i| i.to_string()).collect();
            let device_list = indices.join(",");
            // NOTE(review): these pushes bypass `env_keys`, so an earlier
            // entry with the same name is not replaced and may be duplicated.
            match gpu.vendor.as_str() {
                "nvidia" => {
                    env.push(format!("NVIDIA_VISIBLE_DEVICES={device_list}"));
                    env.push(format!("CUDA_VISIBLE_DEVICES={device_list}"));
                }
                "amd" => {
                    env.push(format!("ROCR_VISIBLE_DEVICES={device_list}"));
                    env.push(format!("HIP_VISIBLE_DEVICES={device_list}"));
                }
                "intel" => {
                    env.push(format!("ZE_AFFINITY_MASK={device_list}"));
                }
                _ => {}
            }
        }

        // MPS sharing: validate the host directories now (this can fail) and
        // export them; the same directories are bind-mounted further below.
        let mps_dirs = if let Some(ref gpu) = spec.resources.gpu {
            resolve_mps_dirs(gpu)?
        } else {
            None
        };
        if let Some(ref dirs) = mps_dirs {
            let pipe = format!("CUDA_MPS_PIPE_DIRECTORY={}", dirs.pipe_dir.display());
            let log = format!("CUDA_MPS_LOG_DIRECTORY={}", dirs.log_dir.display());
            if env_keys.contains("CUDA_MPS_PIPE_DIRECTORY") {
                env.retain(|e| e.split('=').next() != Some("CUDA_MPS_PIPE_DIRECTORY"));
            }
            if env_keys.contains("CUDA_MPS_LOG_DIRECTORY") {
                env.retain(|e| e.split('=').next() != Some("CUDA_MPS_LOG_DIRECTORY"));
            }
            env_keys.insert("CUDA_MPS_PIPE_DIRECTORY".to_string());
            env_keys.insert("CUDA_MPS_LOG_DIRECTORY".to_string());
            env.push(pipe);
            env.push(log);
        }
        // Time-slice sharing: pin the container to its assigned slice index,
        // replacing any CUDA_VISIBLE_DEVICES entry added above.
        if let Some(ref gpu) = spec.resources.gpu {
            if gpu.sharing == Some(GpuSharingMode::TimeSlice) {
                if let Some(idx) = gpu.time_slice_index {
                    env.retain(|e| e.split('=').next() != Some("CUDA_VISIBLE_DEVICES"));
                    env_keys.insert("CUDA_VISIBLE_DEVICES".to_string());
                    env.push(format!("CUDA_VISIBLE_DEVICES={idx}"));
                }
            }
        }

        // Distributed-training variables. World size and ranks are fixed to a
        // single-process layout here, and the interface name is hard-coded to
        // eth0. These entries are appended without de-duplication.
        if let Some(ref gpu) = spec.resources.gpu {
            if let Some(ref dist) = gpu.distributed {
                env.push(format!("MASTER_PORT={}", dist.master_port));
                env.push(format!("MASTER_ADDR={}", container_id.service));
                env.push("WORLD_SIZE=1".to_string());
                env.push("RANK=0".to_string());
                env.push("LOCAL_RANK=0".to_string());
                match dist.backend.as_str() {
                    "nccl" => env.push("NCCL_SOCKET_IFNAME=eth0".to_string()),
                    "gloo" => env.push("GLOO_SOCKET_IFNAME=eth0".to_string()),
                    _ => {}
                }
            }
        }

        let capabilities = self.build_capabilities(spec)?;

        // Working directory precedence: builder override, then service spec,
        // then a non-empty image working dir, then "/".
        let cwd = self
            .cwd
            .clone()
            .or_else(|| spec.command.workdir.clone())
            .or_else(|| {
                self.image_config
                    .as_ref()
                    .and_then(|c| c.working_dir.as_ref())
                    .filter(|w| !w.is_empty())
                    .cloned()
            })
            .unwrap_or_else(|| "/".to_string());

        // Builder-level args bypass command resolution entirely.
        let process_args = if let Some(ref args) = self.args {
            args.clone()
        } else {
            Self::resolve_command_from_spec(spec, self.image_config.as_ref())
        };

        // no_new_privileges is enabled only for unprivileged containers that
        // request no explicit capabilities.
        let mut process_builder = ProcessBuilder::default()
            .terminal(false)
            .user(user)
            .env(env)
            .args(process_args)
            .cwd(cwd)
            .no_new_privileges(!spec.privileged && spec.capabilities.is_empty());

        if let Some(caps) = capabilities {
            process_builder = process_builder.capabilities(caps);
        }

        let process = process_builder
            .build()
            .map_err(|e| AgentError::InvalidSpec(format!("failed to build process: {e}")))?;

        // Relative path: matches the `rootfs` entry `build` creates inside
        // the bundle directory.
        let root = RootBuilder::default()
            .path("rootfs".to_string())
            .readonly(false)
            .build()
            .map_err(|e| AgentError::InvalidSpec(format!("failed to build root: {e}")))?;

        let mut mounts = self.build_default_mounts(spec)?;

        let storage_mounts = self.build_storage_mounts(spec, volume_paths)?;
        mounts.extend(storage_mounts);

        // Optional host socket, exposed read-only at the default socket path.
        if let Some(ref socket_path) = self.socket_path {
            mounts.push(
                MountBuilder::default()
                    .destination(zlayer_paths::ZLayerDirs::default_socket_path())
                    .typ("bind")
                    .source(socket_path.clone())
                    .options(vec!["rbind".into(), "ro".into()])
                    .build()
                    .expect("valid socket mount"),
            );
        }

        // CDI mounts are always bind mounts; "rbind" is added when the CDI
        // spec lists neither "bind" nor "rbind" among its options.
        if let Some(ref edits_per_device) = cdi_edits {
            for edits in edits_per_device {
                for cdi_mount in &edits.mounts {
                    let mut opts = cdi_mount.options.clone();
                    if !opts.iter().any(|o| o == "bind" || o == "rbind") {
                        opts.push("rbind".to_string());
                    }
                    mounts.push(
                        MountBuilder::default()
                            .destination(cdi_mount.container_path.clone())
                            .typ("bind")
                            .source(cdi_mount.host_path.clone())
                            .options(opts)
                            .build()
                            .map_err(|e| {
                                AgentError::InvalidSpec(format!("failed to build CDI mount: {e}"))
                            })?,
                    );
                }
            }
        }

        // MPS directories are mounted read-write at the same path inside the
        // container as on the host, matching the env vars exported above.
        if let Some(ref dirs) = mps_dirs {
            mounts.push(
                MountBuilder::default()
                    .destination(dirs.pipe_dir.clone())
                    .typ("bind")
                    .source(dirs.pipe_dir.clone())
                    .options(vec!["rbind".into(), "rw".into()])
                    .build()
                    .map_err(|e| {
                        AgentError::InvalidSpec(format!("failed to build MPS pipe mount: {e}"))
                    })?,
            );
            mounts.push(
                MountBuilder::default()
                    .destination(dirs.log_dir.clone())
                    .typ("bind")
                    .source(dirs.log_dir.clone())
                    .options(vec!["rbind".into(), "rw".into()])
                    .build()
                    .map_err(|e| {
                        AgentError::InvalidSpec(format!("failed to build MPS log mount: {e}"))
                    })?,
            );
        }
        // Time-slicing config: must be a regular file on the host; mounted
        // read-only at TIMESLICE_CONFIG_CONTAINER_PATH.
        if let Some(ref gpu) = spec.resources.gpu {
            if gpu.sharing == Some(GpuSharingMode::TimeSlice) {
                if let Some(ref cfg_path) = gpu.time_slicing_config_path {
                    let host = PathBuf::from(cfg_path);
                    if !host.is_file() {
                        return Err(AgentError::GpuSharingUnavailable {
                            mode: "time-slice".to_string(),
                            reason: format!(
                                "time-slicing config {} is not a regular file on the host",
                                host.display()
                            ),
                        });
                    }
                    mounts.push(
                        MountBuilder::default()
                            .destination(PathBuf::from(TIMESLICE_CONFIG_CONTAINER_PATH))
                            .typ("bind")
                            .source(host)
                            .options(vec!["rbind".into(), "ro".into()])
                            .build()
                            .map_err(|e| {
                                AgentError::InvalidSpec(format!(
                                    "failed to build time-slicing config mount: {e}"
                                ))
                            })?,
                    );
                }
            }
        }

        let linux = self.build_linux_config(container_id, spec, cdi_edits.as_deref())?;

        // Hostname defaults to the container id when no override is set.
        let hostname = self
            .hostname
            .clone()
            .unwrap_or_else(|| container_id.to_string());

        let mut spec_builder = SpecBuilder::default()
            .version("1.0.2".to_string())
            .root(root)
            .process(process)
            .hostname(hostname)
            .mounts(mounts)
            .linux(linux);

        // Hooks are attached only when CDI edits actually define some.
        if let Some(ref edits_per_device) = cdi_edits {
            if let Some(hooks) = Self::build_hooks_from_cdi(edits_per_device)? {
                spec_builder = spec_builder.hooks(hooks);
            }
        }

        let oci_spec = spec_builder
            .build()
            .map_err(|e| AgentError::InvalidSpec(format!("failed to build OCI spec: {e}")))?;

        Ok(oci_spec)
    }
1220
1221 fn build_hooks_from_cdi(edits_per_device: &[CdiContainerEdits]) -> Result<Option<Hooks>> {
1228 let mut prestart: Vec<Hook> = Vec::new();
1229 let mut create_runtime: Vec<Hook> = Vec::new();
1230 let mut create_container: Vec<Hook> = Vec::new();
1231 let mut start_container: Vec<Hook> = Vec::new();
1232 let mut poststart: Vec<Hook> = Vec::new();
1233 let mut poststop: Vec<Hook> = Vec::new();
1234
1235 for edits in edits_per_device {
1236 let Some(ref h) = edits.hooks else { continue };
1237 for hook in &h.prestart {
1238 prestart.push(convert_cdi_hook(hook)?);
1239 }
1240 for hook in &h.create_runtime {
1241 create_runtime.push(convert_cdi_hook(hook)?);
1242 }
1243 for hook in &h.create_container {
1244 create_container.push(convert_cdi_hook(hook)?);
1245 }
1246 for hook in &h.start_container {
1247 start_container.push(convert_cdi_hook(hook)?);
1248 }
1249 for hook in &h.poststart {
1250 poststart.push(convert_cdi_hook(hook)?);
1251 }
1252 for hook in &h.poststop {
1253 poststop.push(convert_cdi_hook(hook)?);
1254 }
1255 }
1256
1257 if prestart.is_empty()
1258 && create_runtime.is_empty()
1259 && create_container.is_empty()
1260 && start_container.is_empty()
1261 && poststart.is_empty()
1262 && poststop.is_empty()
1263 {
1264 return Ok(None);
1265 }
1266
1267 let mut builder = HooksBuilder::default();
1268 if !prestart.is_empty() {
1269 #[allow(deprecated)]
1270 {
1271 builder = builder.prestart(prestart);
1272 }
1273 }
1274 if !create_runtime.is_empty() {
1275 builder = builder.create_runtime(create_runtime);
1276 }
1277 if !create_container.is_empty() {
1278 builder = builder.create_container(create_container);
1279 }
1280 if !start_container.is_empty() {
1281 builder = builder.start_container(start_container);
1282 }
1283 if !poststart.is_empty() {
1284 builder = builder.poststart(poststart);
1285 }
1286 if !poststop.is_empty() {
1287 builder = builder.poststop(poststop);
1288 }
1289
1290 let hooks = builder
1291 .build()
1292 .map_err(|e| AgentError::InvalidSpec(format!("failed to build CDI hooks: {e}")))?;
1293 Ok(Some(hooks))
1294 }
1295
1296 #[allow(clippy::unused_self)]
1298 fn build_capabilities(
1299 &self,
1300 spec: &ServiceSpec,
1301 ) -> Result<Option<oci_spec::runtime::LinuxCapabilities>> {
1302 if spec.privileged {
1303 let all_caps: HashSet<Capability> = ALL_CAPABILITIES.iter().copied().collect();
1305 let empty_caps: HashSet<Capability> = HashSet::new();
1306
1307 let caps = LinuxCapabilitiesBuilder::default()
1308 .bounding(all_caps.clone())
1309 .effective(all_caps.clone())
1310 .permitted(all_caps)
1311 .inheritable(empty_caps.clone())
1312 .ambient(empty_caps)
1313 .build()
1314 .map_err(|e| {
1315 AgentError::InvalidSpec(format!("failed to build capabilities: {e}"))
1316 })?;
1317
1318 Ok(Some(caps))
1319 } else if !spec.capabilities.is_empty() {
1320 let caps: HashSet<Capability> = spec
1322 .capabilities
1323 .iter()
1324 .filter_map(|c| {
1325 let cap_name = if c.starts_with("CAP_") {
1327 c.to_uppercase()
1328 } else {
1329 format!("CAP_{}", c.to_uppercase())
1330 };
1331 Capability::from_str(&cap_name).ok()
1332 })
1333 .collect();
1334
1335 let empty_caps: HashSet<Capability> = HashSet::new();
1336
1337 let built_caps = LinuxCapabilitiesBuilder::default()
1338 .bounding(caps.clone())
1339 .effective(caps.clone())
1340 .permitted(caps)
1341 .inheritable(empty_caps.clone())
1342 .ambient(empty_caps)
1343 .build()
1344 .map_err(|e| {
1345 AgentError::InvalidSpec(format!("failed to build capabilities: {e}"))
1346 })?;
1347
1348 Ok(Some(built_caps))
1349 } else {
1350 let default_caps: HashSet<Capability> = [
1352 Capability::Chown,
1353 Capability::DacOverride,
1354 Capability::Fsetid,
1355 Capability::Fowner,
1356 Capability::Mknod,
1357 Capability::NetRaw,
1358 Capability::Setgid,
1359 Capability::Setuid,
1360 Capability::Setfcap,
1361 Capability::Setpcap,
1362 Capability::NetBindService,
1363 Capability::SysChroot,
1364 Capability::Kill,
1365 Capability::AuditWrite,
1366 ]
1367 .into_iter()
1368 .collect();
1369
1370 let empty_caps: HashSet<Capability> = HashSet::new();
1371
1372 let built_caps = LinuxCapabilitiesBuilder::default()
1373 .bounding(default_caps.clone())
1374 .effective(default_caps.clone())
1375 .permitted(default_caps)
1376 .inheritable(empty_caps.clone())
1377 .ambient(empty_caps)
1378 .build()
1379 .map_err(|e| {
1380 AgentError::InvalidSpec(format!("failed to build capabilities: {e}"))
1381 })?;
1382
1383 Ok(Some(built_caps))
1384 }
1385 }
1386
1387 #[allow(clippy::unused_self, clippy::too_many_lines)]
1389 fn build_default_mounts(&self, spec: &ServiceSpec) -> Result<Vec<Mount>> {
1390 let mut mounts = Vec::new();
1391
1392 mounts.push(
1394 MountBuilder::default()
1395 .destination("/proc".to_string())
1396 .typ("proc".to_string())
1397 .source("proc".to_string())
1398 .options(vec![
1399 "nosuid".to_string(),
1400 "noexec".to_string(),
1401 "nodev".to_string(),
1402 ])
1403 .build()
1404 .map_err(|e| {
1405 AgentError::InvalidSpec(format!("failed to build /proc mount: {e}"))
1406 })?,
1407 );
1408
1409 mounts.push(
1411 MountBuilder::default()
1412 .destination("/dev".to_string())
1413 .typ("tmpfs".to_string())
1414 .source("tmpfs".to_string())
1415 .options(vec![
1416 "nosuid".to_string(),
1417 "strictatime".to_string(),
1418 "mode=755".to_string(),
1419 "size=65536k".to_string(),
1420 ])
1421 .build()
1422 .map_err(|e| AgentError::InvalidSpec(format!("failed to build /dev mount: {e}")))?,
1423 );
1424
1425 mounts.push(
1427 MountBuilder::default()
1428 .destination("/dev/pts".to_string())
1429 .typ("devpts".to_string())
1430 .source("devpts".to_string())
1431 .options(vec![
1432 "nosuid".to_string(),
1433 "noexec".to_string(),
1434 "newinstance".to_string(),
1435 "ptmxmode=0666".to_string(),
1436 "mode=0620".to_string(),
1437 "gid=5".to_string(),
1438 ])
1439 .build()
1440 .map_err(|e| {
1441 AgentError::InvalidSpec(format!("failed to build /dev/pts mount: {e}"))
1442 })?,
1443 );
1444
1445 mounts.push(
1447 MountBuilder::default()
1448 .destination("/dev/shm".to_string())
1449 .typ("tmpfs".to_string())
1450 .source("shm".to_string())
1451 .options(vec![
1452 "nosuid".to_string(),
1453 "noexec".to_string(),
1454 "nodev".to_string(),
1455 "mode=1777".to_string(),
1456 "size=65536k".to_string(),
1457 ])
1458 .build()
1459 .map_err(|e| {
1460 AgentError::InvalidSpec(format!("failed to build /dev/shm mount: {e}"))
1461 })?,
1462 );
1463
1464 mounts.push(
1466 MountBuilder::default()
1467 .destination("/dev/mqueue".to_string())
1468 .typ("mqueue".to_string())
1469 .source("mqueue".to_string())
1470 .options(vec![
1471 "nosuid".to_string(),
1472 "noexec".to_string(),
1473 "nodev".to_string(),
1474 ])
1475 .build()
1476 .map_err(|e| {
1477 AgentError::InvalidSpec(format!("failed to build /dev/mqueue mount: {e}"))
1478 })?,
1479 );
1480
1481 let sys_options = if spec.privileged {
1483 vec![
1484 "nosuid".to_string(),
1485 "noexec".to_string(),
1486 "nodev".to_string(),
1487 ]
1488 } else {
1489 vec![
1490 "nosuid".to_string(),
1491 "noexec".to_string(),
1492 "nodev".to_string(),
1493 "ro".to_string(),
1494 ]
1495 };
1496
1497 mounts.push(
1498 MountBuilder::default()
1499 .destination("/sys".to_string())
1500 .typ("sysfs".to_string())
1501 .source("sysfs".to_string())
1502 .options(sys_options)
1503 .build()
1504 .map_err(|e| AgentError::InvalidSpec(format!("failed to build /sys mount: {e}")))?,
1505 );
1506
1507 mounts.push(
1509 MountBuilder::default()
1510 .destination("/sys/fs/cgroup".to_string())
1511 .typ("cgroup2".to_string())
1512 .source("cgroup".to_string())
1513 .options(vec![
1514 "nosuid".to_string(),
1515 "noexec".to_string(),
1516 "nodev".to_string(),
1517 "relatime".to_string(),
1518 ])
1519 .build()
1520 .map_err(|e| {
1521 AgentError::InvalidSpec(format!("failed to build cgroup mount: {e}"))
1522 })?,
1523 );
1524
1525 Ok(mounts)
1526 }
1527
1528 #[allow(clippy::unused_self, clippy::too_many_lines)]
1534 fn build_storage_mounts(
1535 &self,
1536 spec: &ServiceSpec,
1537 volume_paths: &std::collections::HashMap<String, PathBuf>,
1538 ) -> Result<Vec<Mount>> {
1539 let mut mounts = Vec::new();
1540
1541 for storage in &spec.storage {
1542 let mount = match storage {
1543 StorageSpec::Bind {
1544 source,
1545 target,
1546 readonly,
1547 } => {
1548 let mut options = vec!["rbind".to_string()];
1549 if *readonly {
1550 options.push("ro".to_string());
1551 } else {
1552 options.push("rw".to_string());
1553 }
1554
1555 MountBuilder::default()
1556 .destination(target.clone())
1557 .typ("none".to_string())
1558 .source(source.clone())
1559 .options(options)
1560 .build()
1561 .map_err(|e| {
1562 AgentError::InvalidSpec(format!(
1563 "failed to build bind mount for {target}: {e}"
1564 ))
1565 })?
1566 }
1567
1568 StorageSpec::Named {
1569 name,
1570 target,
1571 readonly,
1572 tier,
1573 ..
1574 } => {
1575 let source = volume_paths.get(name).ok_or_else(|| {
1577 AgentError::InvalidSpec(format!(
1578 "volume '{name}' not prepared - ensure StorageManager.ensure_volume() was called"
1579 ))
1580 })?;
1581
1582 if matches!(tier, StorageTier::Network) {
1584 tracing::warn!(
1585 volume = %name,
1586 tier = ?tier,
1587 "Network storage tier is NOT SQLite-safe. Avoid using SQLite databases on this volume."
1588 );
1589 }
1590
1591 let mut options = vec!["rbind".to_string()];
1592 if *readonly {
1593 options.push("ro".to_string());
1594 } else {
1595 options.push("rw".to_string());
1596 }
1597
1598 MountBuilder::default()
1599 .destination(target.clone())
1600 .typ("none".to_string())
1601 .source(source.to_string_lossy().to_string())
1602 .options(options)
1603 .build()
1604 .map_err(|e| {
1605 AgentError::InvalidSpec(format!(
1606 "failed to build named volume mount for {target}: {e}"
1607 ))
1608 })?
1609 }
1610
1611 StorageSpec::Anonymous { target, tier } => {
1612 let key = format!("_anon_{}", target.trim_start_matches('/').replace('/', "_"));
1615 let source = volume_paths.get(&key).ok_or_else(|| {
1616 AgentError::InvalidSpec(format!(
1617 "anonymous volume for '{target}' not prepared"
1618 ))
1619 })?;
1620
1621 if matches!(tier, StorageTier::Network) {
1622 tracing::warn!(
1623 target = %target,
1624 tier = ?tier,
1625 "Network storage tier is NOT SQLite-safe."
1626 );
1627 }
1628
1629 let options = vec!["rbind".to_string(), "rw".to_string()];
1630
1631 MountBuilder::default()
1632 .destination(target.clone())
1633 .typ("none".to_string())
1634 .source(source.to_string_lossy().to_string())
1635 .options(options)
1636 .build()
1637 .map_err(|e| {
1638 AgentError::InvalidSpec(format!(
1639 "failed to build anonymous volume mount for {target}: {e}"
1640 ))
1641 })?
1642 }
1643
1644 StorageSpec::Tmpfs { target, size, mode } => {
1645 let mut options = vec!["nosuid".to_string(), "nodev".to_string()];
1646
1647 if let Some(size_str) = size {
1648 options.push(format!("size={size_str}"));
1649 }
1650
1651 if let Some(mode_val) = mode {
1652 options.push(format!("mode={mode_val:o}"));
1653 }
1654
1655 MountBuilder::default()
1656 .destination(target.clone())
1657 .typ("tmpfs".to_string())
1658 .source("tmpfs".to_string())
1659 .options(options)
1660 .build()
1661 .map_err(|e| {
1662 AgentError::InvalidSpec(format!(
1663 "failed to build tmpfs mount for {target}: {e}"
1664 ))
1665 })?
1666 }
1667
1668 StorageSpec::S3 {
1669 bucket,
1670 prefix,
1671 target,
1672 readonly,
1673 endpoint: _,
1674 credentials: _,
1675 } => {
1676 let key = format!("_s3_{}_{}", bucket, prefix.as_deref().unwrap_or(""));
1679 let source = volume_paths.get(&key).ok_or_else(|| {
1680 AgentError::InvalidSpec(format!(
1681 "S3 volume for bucket '{bucket}' not mounted - ensure StorageManager.mount_s3() was called"
1682 ))
1683 })?;
1684
1685 tracing::warn!(
1686 bucket = %bucket,
1687 target = %target,
1688 "S3 storage is NOT SQLite-safe. Use for read-heavy workloads only."
1689 );
1690
1691 let mut options = vec!["rbind".to_string()];
1692 if *readonly {
1693 options.push("ro".to_string());
1694 } else {
1695 options.push("rw".to_string());
1696 }
1697
1698 MountBuilder::default()
1699 .destination(target.clone())
1700 .typ("none".to_string())
1701 .source(source.to_string_lossy().to_string())
1702 .options(options)
1703 .build()
1704 .map_err(|e| {
1705 AgentError::InvalidSpec(format!(
1706 "failed to build S3 mount for {target}: {e}"
1707 ))
1708 })?
1709 }
1710 };
1711
1712 mounts.push(mount);
1713 }
1714
1715 Ok(mounts)
1716 }
1717
/// Builds the `linux` section of the OCI runtime spec for one container.
///
/// The section is assembled in this order:
///
/// 1. Namespaces: pid, ipc, uts and mount always; network unless
///    `self.host_network` is set; user and cgroup when the daemon's effective
///    uid is not root.
/// 2. uid/gid mappings (unix, non-root only) from `build_rootless_id_mappings`.
/// 3. Resource limits from `build_resources`, if any.
/// 4. Device nodes from `build_devices`, plus every device node listed in
///    `cdi_edits`.
/// 5. Rootfs propagation `private`, and masked/read-only paths (both empty
///    for privileged containers).
/// 6. The cgroups path `<parent>/<container id>`, when a parent is resolved.
///
/// # Errors
///
/// Returns [`AgentError::InvalidSpec`] when no cgroup parent is resolved and
/// the daemon cannot write the cgroup root (Linux only), or when the final
/// builder fails. Errors from `build_resources`, `build_devices` and
/// `cdi_node_to_oci_device` are propagated unchanged.
///
/// # Panics
///
/// Panics if a `LinuxNamespaceBuilder` fails to build (the `unwrap()` calls).
#[allow(clippy::similar_names)] #[allow(clippy::too_many_lines)]
fn build_linux_config(
    &self,
    container_id: &ContainerId,
    spec: &ServiceSpec,
    cdi_edits: Option<&[CdiContainerEdits]>,
) -> Result<oci_spec::runtime::Linux> {
    // Namespaces every container gets regardless of configuration.
    let mut namespaces = vec![
        LinuxNamespaceBuilder::default()
            .typ(LinuxNamespaceType::Pid)
            .build()
            .unwrap(),
        LinuxNamespaceBuilder::default()
            .typ(LinuxNamespaceType::Ipc)
            .build()
            .unwrap(),
        LinuxNamespaceBuilder::default()
            .typ(LinuxNamespaceType::Uts)
            .build()
            .unwrap(),
        LinuxNamespaceBuilder::default()
            .typ(LinuxNamespaceType::Mount)
            .build()
            .unwrap(),
    ];

    // No network namespace is requested when the builder is in host-network mode.
    if !self.host_network {
        namespaces.push(
            LinuxNamespaceBuilder::default()
                .typ(LinuxNamespaceType::Network)
                .build()
                .unwrap(),
        );
    }

    // "Rootless" here means the daemon's effective uid is not root; it is
    // always false on non-unix targets.
    #[cfg(unix)]
    let rootless = !nix::unistd::geteuid().is_root();
    #[cfg(not(unix))]
    let rootless = false;

    // Non-root daemons additionally request user and cgroup namespaces.
    if rootless {
        namespaces.push(
            LinuxNamespaceBuilder::default()
                .typ(LinuxNamespaceType::User)
                .build()
                .unwrap(),
        );
        namespaces.push(
            LinuxNamespaceBuilder::default()
                .typ(LinuxNamespaceType::Cgroup)
                .build()
                .unwrap(),
        );
    }

    let mut linux_builder = LinuxBuilder::default().namespaces(namespaces);

    // The user namespace needs id mappings. The user name is resolved from the
    // effective uid and falls back to an empty string when the lookup fails.
    #[cfg(unix)]
    if rootless {
        let euid = nix::unistd::geteuid();
        let egid = nix::unistd::getegid();
        let username = nix::unistd::User::from_uid(euid)
            .ok()
            .flatten()
            .map(|u| u.name)
            .unwrap_or_default();
        linux_builder = linux_builder
            .uid_mappings(build_rootless_id_mappings(
                euid.as_raw(),
                "/etc/subuid",
                &username,
            ))
            .gid_mappings(build_rootless_id_mappings(
                egid.as_raw(),
                "/etc/subgid",
                &username,
            ));
    }

    // Resource limits are only attached when at least one is configured.
    let resources = self.build_resources(spec)?;
    if let Some(resources) = resources {
        linux_builder = linux_builder.resources(resources);
    }

    // When CDI edits are supplied, `build_devices` is told to skip its own GPU
    // defaults and the CDI device nodes are appended instead.
    let mut devices = self.build_devices(spec, None, cdi_edits.is_some())?;
    if let Some(edits_per_device) = cdi_edits {
        for edits in edits_per_device {
            for node in &edits.device_nodes {
                devices.push(cdi_node_to_oci_device(node)?);
            }
        }
    }
    if !devices.is_empty() {
        linux_builder = linux_builder.devices(devices);
    }

    linux_builder = linux_builder.rootfs_propagation("private".to_string());

    if spec.privileged {
        // Privileged containers get explicitly empty masked/read-only lists.
        linux_builder = linux_builder.masked_paths(vec![]).readonly_paths(vec![]);
    } else {
        let masked_paths = vec![
            "/proc/acpi".to_string(),
            "/proc/asound".to_string(),
            "/proc/kcore".to_string(),
            "/proc/keys".to_string(),
            "/proc/latency_stats".to_string(),
            "/proc/timer_list".to_string(),
            "/proc/timer_stats".to_string(),
            "/proc/sched_debug".to_string(),
            "/proc/scsi".to_string(),
            "/sys/firmware".to_string(),
        ];

        let readonly_paths = vec![
            "/proc/bus".to_string(),
            "/proc/fs".to_string(),
            "/proc/irq".to_string(),
            "/proc/sys".to_string(),
            "/proc/sysrq-trigger".to_string(),
        ];

        linux_builder = linux_builder
            .masked_paths(masked_paths)
            .readonly_paths(readonly_paths);
    }

    let cid = container_id.to_string();

    // Explicit cgroup parent: a non-empty `spec.cgroup_parent` wins over a
    // non-empty `ZLAYER_CGROUP_PARENT` environment variable. The second tuple
    // element records where the value came from, for logging below.
    let explicit_parent: Option<(String, &'static str)> =
        if let Some(p) = spec.cgroup_parent.as_deref().filter(|s| !s.is_empty()) {
            Some((p.to_string(), "spec"))
        } else if let Some(p) = std::env::var("ZLAYER_CGROUP_PARENT")
            .ok()
            .filter(|s| !s.is_empty())
        {
            Some((p, "env"))
        } else {
            None
        };

    // Automatic cgroup parent (Linux only).
    // NOTE(review): this is evaluated even when an explicit parent was found,
    // so `ensure_daemon_leaf_and_container_parent()` always runs; judging by
    // the "auto-init" log message below it may migrate the daemon's own
    // cgroup. Confirm that running it unconditionally is intended.
    #[cfg(target_os = "linux")]
    let auto_parent: Option<(String, &'static str)> =
        if let Some(p) = crate::capability::ensure_daemon_leaf_and_container_parent() {
            Some((p, "auto-init"))
        } else if let Some(p) = crate::capability::current_cgroup_v2_path() {
            Some((p, "auto"))
        } else {
            None
        };
    #[cfg(not(target_os = "linux"))]
    let auto_parent: Option<(String, &'static str)> = None;

    // Precedence: explicit (spec, then env) before automatic; "none" marks
    // the case where nothing was resolved.
    let (cgroup_parent_value, cgroup_parent_source): (Option<String>, &'static str) =
        explicit_parent
            .or(auto_parent)
            .map_or((None, "none"), |(p, s)| (Some(p), s));

    #[cfg(target_os = "linux")]
    if cgroup_parent_value.is_none() && crate::capability::DaemonCapabilities::get().is_nested {
        tracing::warn!(
            container_id = %cid,
            "capability survey reports nested daemon but cgroup_parent could not be resolved — proceeding with v2 root"
        );
    }

    if let Some(parent) = cgroup_parent_value {
        // Trailing slashes are dropped so the join never yields "//".
        let parent = parent.trim_end_matches('/');
        let full = format!("{parent}/{cid}");
        match cgroup_parent_source {
            "spec" => tracing::info!(
                container_id = %cid,
                source = "spec",
                path = %full,
                "cgroup_parent selected"
            ),
            "env" => tracing::info!(
                container_id = %cid,
                source = "env",
                path = %full,
                "cgroup_parent selected"
            ),
            "auto" => tracing::info!(
                container_id = %cid,
                source = "auto",
                path = %full,
                "cgroup_parent selected (from /proc/self/cgroup)"
            ),
            "auto-init" => tracing::info!(
                container_id = %cid,
                source = "auto-init",
                path = %full,
                "cgroup_parent selected (migrated daemon to <scope>/init; containers go under <scope>/containers)"
            ),
            // Only the four tags assigned above can reach this branch, since a
            // `Some` parent is never paired with "none".
            _ => unreachable!(),
        }
        linux_builder = linux_builder.cgroups_path(std::path::PathBuf::from(full));
    } else {
        // No parent resolved: on Linux this is only acceptable when the
        // capability survey says the cgroup root itself is writable.
        #[cfg(target_os = "linux")]
        {
            let caps = crate::capability::DaemonCapabilities::get();
            if !caps.can_write_cgroup_root {
                return Err(AgentError::InvalidSpec(format!(
                    "cannot create container {cid}: no writable cgroup parent. \
                     /proc/self/cgroup reports the cgroup-v2 root, and \
                     /sys/fs/cgroup is read-only to this process. Fix one of: \
                     (a) run the daemon's outer container with --cgroupns=host \
                     so /proc/self/cgroup reports a real parent; \
                     (b) set ZLAYER_CGROUP_PARENT=/path/to/writable/cgroup; \
                     (c) grant the daemon write access to /sys/fs/cgroup."
                )));
            }
            tracing::info!(
                container_id = %cid,
                "cgroup_parent unset — libcontainer will use v2 root (cgroup root is writable here)"
            );
        }
        #[cfg(not(target_os = "linux"))]
        tracing::debug!(
            container_id = %cid,
            "non-Linux host — cgroup_parent unset; libcontainer inside the WSL distro will resolve a parent from its cgroup-v2 root"
        );
    }

    linux_builder
        .build()
        .map_err(|e| AgentError::InvalidSpec(format!("failed to build linux config: {e}")))
}
2003
2004 #[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
2006 fn build_resources(
2007 &self,
2008 spec: &ServiceSpec,
2009 ) -> Result<Option<oci_spec::runtime::LinuxResources>> {
2010 let mut resources_builder = LinuxResourcesBuilder::default();
2011 let mut has_resources = false;
2012
2013 if let Some(cpu_limit) = spec.resources.cpu {
2015 let quota = (cpu_limit * 100_000.0) as i64;
2018 let cpu = LinuxCpuBuilder::default()
2019 .quota(quota)
2020 .period(100_000u64)
2021 .build()
2022 .map_err(|e| AgentError::InvalidSpec(format!("failed to build CPU limits: {e}")))?;
2023
2024 resources_builder = resources_builder.cpu(cpu);
2025 has_resources = true;
2026 }
2027
2028 if let Some(ref memory_str) = spec.resources.memory {
2030 let bytes = parse_memory_string(memory_str)
2031 .map_err(|e| AgentError::InvalidSpec(format!("invalid memory limit: {e}")))?;
2032
2033 let memory = LinuxMemoryBuilder::default()
2034 .limit(bytes as i64)
2035 .build()
2036 .map_err(|e| {
2037 AgentError::InvalidSpec(format!("failed to build memory limits: {e}"))
2038 })?;
2039
2040 resources_builder = resources_builder.memory(memory);
2041 has_resources = true;
2042 }
2043
2044 let device_rules = self.build_device_cgroup_rules(spec, None)?;
2046 if !device_rules.is_empty() {
2047 resources_builder = resources_builder.devices(device_rules);
2048 has_resources = true;
2049 }
2050
2051 if has_resources {
2052 let resources = resources_builder
2053 .build()
2054 .map_err(|e| AgentError::InvalidSpec(format!("failed to build resources: {e}")))?;
2055 Ok(Some(resources))
2056 } else {
2057 Ok(None)
2058 }
2059 }
2060
2061 #[allow(clippy::unused_self, clippy::too_many_lines)]
2063 fn build_device_cgroup_rules(
2064 &self,
2065 spec: &ServiceSpec,
2066 _gpu_indices: Option<&[u32]>,
2067 ) -> Result<Vec<oci_spec::runtime::LinuxDeviceCgroup>> {
2068 let mut rules = Vec::new();
2069
2070 if spec.privileged {
2071 let rule = LinuxDeviceCgroupBuilder::default()
2073 .allow(true)
2074 .access("rwm".to_string())
2075 .build()
2076 .map_err(|e| {
2077 AgentError::InvalidSpec(format!("failed to build device cgroup rule: {e}"))
2078 })?;
2079 rules.push(rule);
2080 } else {
2081 let deny_all = LinuxDeviceCgroupBuilder::default()
2083 .allow(false)
2084 .access("rwm".to_string())
2085 .build()
2086 .map_err(|e| AgentError::InvalidSpec(format!("failed to build deny rule: {e}")))?;
2087 rules.push(deny_all);
2088
2089 let standard_char_devices = [
2092 (1, 3, "rwm"), (1, 5, "rwm"), (1, 7, "rwm"), (1, 8, "rwm"), (1, 9, "rwm"), (5, 0, "rwm"), (5, 1, "rwm"), (5, 2, "rwm"), (136, -1, "rwm"), ];
2102
2103 for (major, minor, access) in standard_char_devices {
2104 let mut builder = LinuxDeviceCgroupBuilder::default()
2105 .allow(true)
2106 .typ(LinuxDeviceType::C)
2107 .major(i64::from(major))
2108 .access(access.to_string());
2109
2110 if minor >= 0 {
2111 builder = builder.minor(i64::from(minor));
2112 }
2113
2114 let rule = builder.build().map_err(|e| {
2115 AgentError::InvalidSpec(format!("failed to build char device rule: {e}"))
2116 })?;
2117 rules.push(rule);
2118 }
2119
2120 #[cfg(unix)]
2124 for device in &spec.devices {
2125 if let Ok((major, minor)) = get_device_major_minor(&device.path) {
2126 let dev_type = get_device_type(&device.path).unwrap_or(LinuxDeviceType::C);
2127
2128 let mut access = String::new();
2130 if device.read {
2131 access.push('r');
2132 }
2133 if device.write {
2134 access.push('w');
2135 }
2136 if device.mknod {
2137 access.push('m');
2138 }
2139 if access.is_empty() {
2140 access = "rw".to_string();
2141 }
2142
2143 let rule = LinuxDeviceCgroupBuilder::default()
2144 .allow(true)
2145 .typ(dev_type)
2146 .major(major)
2147 .minor(minor)
2148 .access(access)
2149 .build()
2150 .map_err(|e| {
2151 AgentError::InvalidSpec(format!(
2152 "failed to build device rule for {}: {}",
2153 device.path, e
2154 ))
2155 })?;
2156 rules.push(rule);
2157 } else {
2158 tracing::warn!("Failed to get device info for {}, skipping", device.path);
2159 }
2160 }
2161
2162 if let Some(ref gpu) = spec.resources.gpu {
2164 match gpu.vendor.as_str() {
2165 "nvidia" => {
2166 let rule = LinuxDeviceCgroupBuilder::default()
2168 .allow(true)
2169 .typ(LinuxDeviceType::C)
2170 .major(195i64)
2171 .access("rwm".to_string())
2172 .build()
2173 .map_err(|e| {
2174 AgentError::InvalidSpec(format!(
2175 "failed to build GPU cgroup rule: {e}"
2176 ))
2177 })?;
2178 rules.push(rule);
2179
2180 let uvm_rule = LinuxDeviceCgroupBuilder::default()
2182 .allow(true)
2183 .typ(LinuxDeviceType::C)
2184 .major(510i64)
2185 .access("rwm".to_string())
2186 .build()
2187 .map_err(|e| {
2188 AgentError::InvalidSpec(format!(
2189 "failed to build GPU UVM cgroup rule: {e}"
2190 ))
2191 })?;
2192 rules.push(uvm_rule);
2193 }
2194 "amd" => {
2195 let dri_rule = LinuxDeviceCgroupBuilder::default()
2197 .allow(true)
2198 .typ(LinuxDeviceType::C)
2199 .major(226i64)
2200 .access("rwm".to_string())
2201 .build()
2202 .map_err(|e| {
2203 AgentError::InvalidSpec(format!(
2204 "failed to build AMD DRI cgroup rule: {e}"
2205 ))
2206 })?;
2207 rules.push(dri_rule);
2208
2209 let kfd_rule = LinuxDeviceCgroupBuilder::default()
2211 .allow(true)
2212 .typ(LinuxDeviceType::C)
2213 .major(234i64)
2214 .access("rwm".to_string())
2215 .build()
2216 .map_err(|e| {
2217 AgentError::InvalidSpec(format!(
2218 "failed to build AMD KFD cgroup rule: {e}"
2219 ))
2220 })?;
2221 rules.push(kfd_rule);
2222 }
2223 "intel" => {
2224 let dri_rule = LinuxDeviceCgroupBuilder::default()
2226 .allow(true)
2227 .typ(LinuxDeviceType::C)
2228 .major(226i64)
2229 .access("rwm".to_string())
2230 .build()
2231 .map_err(|e| {
2232 AgentError::InvalidSpec(format!(
2233 "failed to build Intel DRI cgroup rule: {e}"
2234 ))
2235 })?;
2236 rules.push(dri_rule);
2237 }
2238 other => {
2239 tracing::warn!(
2241 vendor = %other,
2242 "Unknown GPU vendor, allowing DRI devices (major 226)"
2243 );
2244 let dri_rule = LinuxDeviceCgroupBuilder::default()
2245 .allow(true)
2246 .typ(LinuxDeviceType::C)
2247 .major(226i64)
2248 .access("rwm".to_string())
2249 .build()
2250 .map_err(|e| {
2251 AgentError::InvalidSpec(format!(
2252 "failed to build GPU DRI cgroup rule: {e}"
2253 ))
2254 })?;
2255 rules.push(dri_rule);
2256 }
2257 }
2258 }
2259 }
2260
2261 Ok(rules)
2262 }
2263
2264 #[allow(clippy::unused_self, clippy::too_many_lines)]
2273 #[cfg_attr(not(unix), allow(clippy::unnecessary_wraps, clippy::needless_return))]
2274 fn build_devices(
2275 &self,
2276 spec: &ServiceSpec,
2277 gpu_indices: Option<&[u32]>,
2278 skip_gpu_defaults: bool,
2279 ) -> Result<Vec<oci_spec::runtime::LinuxDevice>> {
2280 #[cfg(not(unix))]
2281 {
2282 let _ = (spec, gpu_indices, skip_gpu_defaults);
2283 return Ok(Vec::new());
2284 }
2285
2286 #[cfg(unix)]
2287 {
2288 let mut devices = Vec::new();
2289
2290 for device in &spec.devices {
2291 if let Ok((major, minor)) = get_device_major_minor(&device.path) {
2292 let dev_type = get_device_type(&device.path).unwrap_or(LinuxDeviceType::C);
2293
2294 let linux_device = LinuxDeviceBuilder::default()
2295 .path(device.path.clone())
2296 .typ(dev_type)
2297 .major(major)
2298 .minor(minor)
2299 .file_mode(0o666u32)
2300 .uid(0u32)
2301 .gid(0u32)
2302 .build()
2303 .map_err(|e| {
2304 AgentError::InvalidSpec(format!(
2305 "failed to build device {}: {}",
2306 device.path, e
2307 ))
2308 })?;
2309
2310 devices.push(linux_device);
2311 }
2312 }
2313
2314 if skip_gpu_defaults {
2319 return Ok(devices);
2320 }
2321
2322 if let Some(ref gpu) = spec.resources.gpu {
2324 let indices: Vec<u32> =
2325 gpu_indices.map_or_else(|| (0..gpu.count).collect(), <[u32]>::to_vec);
2326
2327 match gpu.vendor.as_str() {
2328 "nvidia" => {
2329 let always_devices =
2331 ["/dev/nvidiactl", "/dev/nvidia-uvm", "/dev/nvidia-uvm-tools"];
2332 for dev_path in &always_devices {
2333 if let Ok((major, minor)) = get_device_major_minor(dev_path) {
2334 let dev_type =
2335 get_device_type(dev_path).unwrap_or(LinuxDeviceType::C);
2336 let linux_device = LinuxDeviceBuilder::default()
2337 .path((*dev_path).to_string())
2338 .typ(dev_type)
2339 .major(major)
2340 .minor(minor)
2341 .file_mode(0o666u32)
2342 .uid(0u32)
2343 .gid(0u32)
2344 .build()
2345 .map_err(|e| {
2346 AgentError::InvalidSpec(format!(
2347 "failed to build GPU device {dev_path}: {e}"
2348 ))
2349 })?;
2350 devices.push(linux_device);
2351 } else {
2352 tracing::warn!(
2353 "GPU device {} not found on host, skipping",
2354 dev_path
2355 );
2356 }
2357 }
2358
2359 for i in &indices {
2361 let dev_path = format!("/dev/nvidia{i}");
2362 if let Ok((major, minor)) = get_device_major_minor(&dev_path) {
2363 let dev_type =
2364 get_device_type(&dev_path).unwrap_or(LinuxDeviceType::C);
2365 let linux_device = LinuxDeviceBuilder::default()
2366 .path(dev_path.clone())
2367 .typ(dev_type)
2368 .major(major)
2369 .minor(minor)
2370 .file_mode(0o666u32)
2371 .uid(0u32)
2372 .gid(0u32)
2373 .build()
2374 .map_err(|e| {
2375 AgentError::InvalidSpec(format!(
2376 "failed to build GPU device {dev_path}: {e}"
2377 ))
2378 })?;
2379 devices.push(linux_device);
2380 } else {
2381 tracing::warn!(
2382 "GPU device {} not found on host, skipping",
2383 dev_path
2384 );
2385 }
2386 }
2387 }
2388 "amd" => {
2389 let amd_always_devices = ["/dev/kfd"];
2391 for dev_path in &amd_always_devices {
2392 if let Ok((major, minor)) = get_device_major_minor(dev_path) {
2393 let dev_type =
2394 get_device_type(dev_path).unwrap_or(LinuxDeviceType::C);
2395 let linux_device = LinuxDeviceBuilder::default()
2396 .path((*dev_path).to_string())
2397 .typ(dev_type)
2398 .major(major)
2399 .minor(minor)
2400 .file_mode(0o666u32)
2401 .uid(0u32)
2402 .gid(0u32)
2403 .build()
2404 .map_err(|e| {
2405 AgentError::InvalidSpec(format!(
2406 "failed to build GPU device {dev_path}: {e}"
2407 ))
2408 })?;
2409 devices.push(linux_device);
2410 } else {
2411 tracing::warn!(
2412 "GPU device {} not found on host, skipping",
2413 dev_path
2414 );
2415 }
2416 }
2417
2418 for i in &indices {
2420 let dev_path = format!("/dev/dri/renderD{}", 128 + i);
2421 if let Ok((major, minor)) = get_device_major_minor(&dev_path) {
2422 let dev_type =
2423 get_device_type(&dev_path).unwrap_or(LinuxDeviceType::C);
2424 let linux_device = LinuxDeviceBuilder::default()
2425 .path(dev_path.clone())
2426 .typ(dev_type)
2427 .major(major)
2428 .minor(minor)
2429 .file_mode(0o666u32)
2430 .uid(0u32)
2431 .gid(0u32)
2432 .build()
2433 .map_err(|e| {
2434 AgentError::InvalidSpec(format!(
2435 "failed to build GPU device {dev_path}: {e}"
2436 ))
2437 })?;
2438 devices.push(linux_device);
2439 } else {
2440 tracing::warn!(
2441 "GPU device {} not found on host, skipping",
2442 dev_path
2443 );
2444 }
2445 }
2446
2447 for i in &indices {
2449 let dev_path = format!("/dev/dri/card{i}");
2450 if let Ok((major, minor)) = get_device_major_minor(&dev_path) {
2451 let dev_type =
2452 get_device_type(&dev_path).unwrap_or(LinuxDeviceType::C);
2453 let linux_device = LinuxDeviceBuilder::default()
2454 .path(dev_path.clone())
2455 .typ(dev_type)
2456 .major(major)
2457 .minor(minor)
2458 .file_mode(0o666u32)
2459 .uid(0u32)
2460 .gid(0u32)
2461 .build()
2462 .map_err(|e| {
2463 AgentError::InvalidSpec(format!(
2464 "failed to build GPU device {dev_path}: {e}"
2465 ))
2466 })?;
2467 devices.push(linux_device);
2468 } else {
2469 tracing::warn!(
2470 "GPU device {} not found on host, skipping",
2471 dev_path
2472 );
2473 }
2474 }
2475 }
2476 "intel" => {
2477 for i in &indices {
2479 let dev_path = format!("/dev/dri/renderD{}", 128 + i);
2480 if let Ok((major, minor)) = get_device_major_minor(&dev_path) {
2481 let dev_type =
2482 get_device_type(&dev_path).unwrap_or(LinuxDeviceType::C);
2483 let linux_device = LinuxDeviceBuilder::default()
2484 .path(dev_path.clone())
2485 .typ(dev_type)
2486 .major(major)
2487 .minor(minor)
2488 .file_mode(0o666u32)
2489 .uid(0u32)
2490 .gid(0u32)
2491 .build()
2492 .map_err(|e| {
2493 AgentError::InvalidSpec(format!(
2494 "failed to build GPU device {dev_path}: {e}"
2495 ))
2496 })?;
2497 devices.push(linux_device);
2498 } else {
2499 tracing::warn!(
2500 "GPU device {} not found on host, skipping",
2501 dev_path
2502 );
2503 }
2504 }
2505
2506 for i in &indices {
2508 let dev_path = format!("/dev/dri/card{i}");
2509 if let Ok((major, minor)) = get_device_major_minor(&dev_path) {
2510 let dev_type =
2511 get_device_type(&dev_path).unwrap_or(LinuxDeviceType::C);
2512 let linux_device = LinuxDeviceBuilder::default()
2513 .path(dev_path.clone())
2514 .typ(dev_type)
2515 .major(major)
2516 .minor(minor)
2517 .file_mode(0o666u32)
2518 .uid(0u32)
2519 .gid(0u32)
2520 .build()
2521 .map_err(|e| {
2522 AgentError::InvalidSpec(format!(
2523 "failed to build GPU device {dev_path}: {e}"
2524 ))
2525 })?;
2526 devices.push(linux_device);
2527 } else {
2528 tracing::warn!(
2529 "GPU device {} not found on host, skipping",
2530 dev_path
2531 );
2532 }
2533 }
2534 }
2535 other => {
2536 tracing::warn!(
2538 vendor = %other,
2539 "Unknown GPU vendor, attempting DRI device passthrough"
2540 );
2541 for i in &indices {
2542 let dev_path = format!("/dev/dri/renderD{}", 128 + i);
2543 if let Ok((major, minor)) = get_device_major_minor(&dev_path) {
2544 let dev_type =
2545 get_device_type(&dev_path).unwrap_or(LinuxDeviceType::C);
2546 let linux_device = LinuxDeviceBuilder::default()
2547 .path(dev_path.clone())
2548 .typ(dev_type)
2549 .major(major)
2550 .minor(minor)
2551 .file_mode(0o666u32)
2552 .uid(0u32)
2553 .gid(0u32)
2554 .build()
2555 .map_err(|e| {
2556 AgentError::InvalidSpec(format!(
2557 "failed to build GPU device {dev_path}: {e}"
2558 ))
2559 })?;
2560 devices.push(linux_device);
2561 } else {
2562 tracing::warn!(
2563 "GPU device {} not found on host, skipping",
2564 dev_path
2565 );
2566 }
2567 }
2568 }
2569 }
2570 }
2571
2572 Ok(devices)
2573 } }
2575
2576 pub async fn write_config(
2588 &self,
2589 container_id: &ContainerId,
2590 spec: &ServiceSpec,
2591 ) -> Result<PathBuf> {
2592 let oci_spec = self
2594 .build_spec_only(container_id, spec, &self.volume_paths)
2595 .await?;
2596
2597 let config_path = self.bundle_dir.join("config.json");
2599 let config_json =
2600 serde_json::to_string_pretty(&oci_spec).map_err(|e| AgentError::CreateFailed {
2601 id: container_id.to_string(),
2602 reason: format!("failed to serialize OCI spec: {e}"),
2603 })?;
2604
2605 fs::write(&config_path, config_json)
2606 .await
2607 .map_err(|e| AgentError::CreateFailed {
2608 id: container_id.to_string(),
2609 reason: format!("failed to write config.json: {e}"),
2610 })?;
2611
2612 tracing::debug!(
2613 "Wrote OCI config.json at {} for container {}",
2614 config_path.display(),
2615 container_id
2616 );
2617
2618 Ok(self.bundle_dir.clone())
2619 }
2620
2621 fn resolve_command_from_spec(
2630 spec: &ServiceSpec,
2631 image_config: Option<&zlayer_registry::ImageConfig>,
2632 ) -> Vec<String> {
2633 let mut args = Vec::new();
2634
2635 match (&spec.command.entrypoint, &spec.command.args) {
2636 (Some(entrypoint), Some(cmd_args)) => {
2637 args.extend_from_slice(entrypoint);
2638 args.extend_from_slice(cmd_args);
2639 }
2640 (Some(entrypoint), None) => {
2641 args.extend_from_slice(entrypoint);
2642 }
2643 (None, Some(cmd_args)) if !cmd_args.is_empty() => {
2644 args.extend_from_slice(cmd_args);
2645 }
2646 _ => {
2647 if let Some(img_cmd) =
2649 image_config.and_then(zlayer_registry::ImageConfig::full_command)
2650 {
2651 if img_cmd.is_empty() {
2652 args.push("/bin/sh".to_string());
2653 } else {
2654 args.extend(img_cmd);
2655 }
2656 } else {
2657 args.push("/bin/sh".to_string());
2658 }
2659 }
2660 }
2661
2662 args
2663 }
2664
2665 pub async fn cleanup(&self) -> Result<()> {
2672 if self.bundle_dir.exists() {
2673 fs::remove_dir_all(&self.bundle_dir)
2674 .await
2675 .map_err(|e| AgentError::CreateFailed {
2676 id: "cleanup".to_string(),
2677 reason: format!(
2678 "failed to remove bundle directory {}: {}",
2679 self.bundle_dir.display(),
2680 e
2681 ),
2682 })?;
2683 }
2684 Ok(())
2685 }
2686}
2687
2688#[cfg(unix)]
2701pub async fn create_bundle(
2702 container_id: &ContainerId,
2703 spec: &ServiceSpec,
2704 rootfs_path: Option<PathBuf>,
2705) -> Result<PathBuf> {
2706 let mut builder =
2707 BundleBuilder::for_container(container_id).with_host_network(spec.host_network);
2708
2709 if let Some(rootfs) = rootfs_path {
2710 builder = builder.with_rootfs(rootfs);
2711 }
2712
2713 builder.build(container_id, spec).await
2714}
2715
2716pub async fn cleanup_bundle(container_id: &ContainerId) -> Result<()> {
2723 let builder = BundleBuilder::for_container(container_id);
2724 builder.cleanup().await
2725}
2726
2727#[cfg(test)]
2728mod tests {
2729 use super::*;
2730 use zlayer_spec::*;
2731
2732 fn mock_spec() -> ServiceSpec {
2733 serde_yaml::from_str::<DeploymentSpec>(
2734 r"
2735version: v1
2736deployment: test
2737services:
2738 test:
2739 rtype: service
2740 image:
2741 name: test:latest
2742 endpoints:
2743 - name: http
2744 protocol: http
2745 port: 8080
2746",
2747 )
2748 .unwrap()
2749 .services
2750 .remove("test")
2751 .unwrap()
2752 }
2753
2754 #[cfg(target_os = "linux")]
2755 fn mock_spec_with_resources() -> ServiceSpec {
2756 serde_yaml::from_str::<DeploymentSpec>(
2757 r"
2758version: v1
2759deployment: test
2760services:
2761 test:
2762 rtype: service
2763 image:
2764 name: test:latest
2765 resources:
2766 cpu: 0.5
2767 memory: 512Mi
2768 env:
2769 MY_VAR: my_value
2770 ANOTHER: value2
2771 endpoints:
2772 - name: http
2773 protocol: http
2774 port: 8080
2775",
2776 )
2777 .unwrap()
2778 .services
2779 .remove("test")
2780 .unwrap()
2781 }
2782
2783 #[cfg(target_os = "linux")]
2784 fn mock_privileged_spec() -> ServiceSpec {
2785 serde_yaml::from_str::<DeploymentSpec>(
2786 r"
2787version: v1
2788deployment: test
2789services:
2790 test:
2791 rtype: service
2792 image:
2793 name: test:latest
2794 privileged: true
2795 endpoints:
2796 - name: http
2797 protocol: http
2798 port: 8080
2799",
2800 )
2801 .unwrap()
2802 .services
2803 .remove("test")
2804 .unwrap()
2805 }
2806
/// Binary (`Ki`/`Mi`/`Gi`) and decimal (`G`) suffixes, plus bare byte counts.
#[test]
fn test_parse_memory_string() {
    let cases = [
        ("512Mi", 512 * 1024 * 1024),
        ("1Gi", 1024 * 1024 * 1024),
        ("2G", 2 * 1000 * 1000 * 1000),
        ("1024", 1024),
        ("512Ki", 512 * 1024),
    ];

    for &(input, expected) in &cases {
        assert_eq!(parse_memory_string(input).unwrap(), expected);
    }
}
2815
/// Empty, non-numeric and fractional inputs are all rejected.
#[test]
fn test_parse_memory_string_errors() {
    for rejected in &["", "abc", "12.5Mi"] {
        assert!(parse_memory_string(rejected).is_err());
    }
}
2822
/// A fresh builder remembers its bundle directory and has no rootfs yet.
#[test]
fn test_bundle_builder_new() {
    let bundle_builder = BundleBuilder::new("/tmp/test-bundle".into());

    assert!(bundle_builder.rootfs_path.is_none());
    assert_eq!(bundle_builder.bundle_dir(), Path::new("/tmp/test-bundle"));
}
2829
/// `for_container` places the bundle under the system default bundles
/// directory, in a folder named `<service>-rep-<replica>`.
#[test]
fn test_bundle_builder_for_container() {
    let container = ContainerId::new("myservice".to_string(), 1);
    let expected_dir = zlayer_paths::ZLayerDirs::system_default()
        .bundles()
        .join("myservice-rep-1");

    let builder = BundleBuilder::for_container(&container);

    assert_eq!(builder.bundle_dir(), expected_dir);
}
2837
/// `with_rootfs` stores the given rootfs path on the builder.
#[test]
fn test_bundle_builder_with_rootfs() {
    let image_rootfs = zlayer_paths::ZLayerDirs::system_default()
        .rootfs()
        .join("myimage");

    let builder = BundleBuilder::new("/tmp/test-bundle".into()).with_rootfs(image_rootfs.clone());

    assert_eq!(builder.rootfs_path, Some(image_rootfs));
}
2845
2846 #[cfg(target_os = "linux")]
2847 #[tokio::test]
2848 async fn test_build_oci_spec_basic() {
2849 let id = ContainerId::new("test".to_string(), 1);
2850 let spec = mock_spec();
2851 let builder = BundleBuilder::new("/tmp/test-bundle".into());
2852
2853 let oci_spec = builder
2854 .build_spec_only(&id, &spec, &std::collections::HashMap::new())
2855 .await
2856 .unwrap();
2857
2858 assert_eq!(oci_spec.version(), "1.0.2");
2859 assert!(oci_spec.root().is_some());
2860 assert_eq!(
2861 oci_spec.root().as_ref().unwrap().path(),
2862 std::path::Path::new("rootfs")
2863 );
2864 assert!(oci_spec.process().is_some());
2865 assert!(oci_spec.linux().is_some());
2866 }
2867
2868 #[cfg(target_os = "linux")]
2869 #[tokio::test]
2870 async fn test_build_oci_spec_with_resources() {
2871 let id = ContainerId::new("test".to_string(), 1);
2872 let spec = mock_spec_with_resources();
2873 let builder = BundleBuilder::new("/tmp/test-bundle".into());
2874
2875 let oci_spec = builder
2876 .build_spec_only(&id, &spec, &std::collections::HashMap::new())
2877 .await
2878 .unwrap();
2879
2880 let linux = oci_spec.linux().as_ref().unwrap();
2882 let resources = linux.resources().as_ref().unwrap();
2883
2884 let cpu = resources.cpu().as_ref().unwrap();
2886 assert_eq!(cpu.quota(), Some(50_000)); assert_eq!(cpu.period(), Some(100_000));
2888
2889 let memory = resources.memory().as_ref().unwrap();
2891 assert_eq!(memory.limit(), Some(512 * 1024 * 1024)); }
2893
2894 #[cfg(target_os = "linux")]
2895 #[tokio::test]
2896 async fn test_build_oci_spec_privileged() {
2897 let id = ContainerId::new("test".to_string(), 1);
2898 let spec = mock_privileged_spec();
2899 let builder = BundleBuilder::new("/tmp/test-bundle".into());
2900
2901 let oci_spec = builder
2902 .build_spec_only(&id, &spec, &std::collections::HashMap::new())
2903 .await
2904 .unwrap();
2905
2906 let process = oci_spec.process().as_ref().unwrap();
2908 let caps = process.capabilities().as_ref().unwrap();
2909 let bounding = caps.bounding().as_ref().unwrap();
2910
2911 assert!(bounding.contains(&Capability::SysAdmin));
2913 assert!(bounding.contains(&Capability::NetAdmin));
2914
2915 let linux = oci_spec.linux().as_ref().unwrap();
2917 assert!(
2918 linux.masked_paths().is_none() || linux.masked_paths().as_ref().unwrap().is_empty()
2919 );
2920 }
2921
2922 #[cfg(target_os = "linux")]
2923 #[tokio::test]
2924 async fn test_build_oci_spec_environment() {
2925 let id = ContainerId::new("test".to_string(), 1);
2926 let spec = mock_spec_with_resources();
2927 let builder = BundleBuilder::new("/tmp/test-bundle".into())
2928 .with_env("EXTRA_VAR".to_string(), "extra_value".to_string());
2929
2930 let oci_spec = builder
2931 .build_spec_only(&id, &spec, &std::collections::HashMap::new())
2932 .await
2933 .unwrap();
2934
2935 let process = oci_spec.process().as_ref().unwrap();
2936 let env = process.env().as_ref().unwrap();
2937
2938 assert!(env.iter().any(|e| e == "MY_VAR=my_value"));
2940 assert!(env.iter().any(|e| e == "ANOTHER=value2"));
2941 assert!(env.iter().any(|e| e == "EXTRA_VAR=extra_value"));
2943 assert!(env.iter().any(|e| e.starts_with("PATH=")));
2945 }
2946
2947 #[cfg(target_os = "linux")]
2948 #[tokio::test]
2949 async fn test_build_namespaces() {
2950 let id = ContainerId::new("test".to_string(), 1);
2951 let spec = mock_spec();
2952 let builder = BundleBuilder::new("/tmp/test-bundle".into());
2953
2954 let oci_spec = builder
2955 .build_spec_only(&id, &spec, &std::collections::HashMap::new())
2956 .await
2957 .unwrap();
2958 let linux = oci_spec.linux().as_ref().unwrap();
2959 let namespaces = linux.namespaces().as_ref().unwrap();
2960
2961 let namespace_types: Vec<_> = namespaces
2963 .iter()
2964 .map(oci_spec::runtime::LinuxNamespace::typ)
2965 .collect();
2966 assert!(namespace_types.contains(&LinuxNamespaceType::Pid));
2967 assert!(namespace_types.contains(&LinuxNamespaceType::Ipc));
2968 assert!(namespace_types.contains(&LinuxNamespaceType::Uts));
2969 assert!(namespace_types.contains(&LinuxNamespaceType::Mount));
2970 assert!(namespace_types.contains(&LinuxNamespaceType::Network));
2971 }
2972
2973 #[cfg(target_os = "linux")]
2974 #[tokio::test]
2975 async fn test_build_namespaces_host_network() {
2976 let id = ContainerId::new("test".to_string(), 1);
2977 let spec = mock_spec();
2978 let builder = BundleBuilder::new("/tmp/test-bundle".into()).with_host_network(true);
2979
2980 let oci_spec = builder
2981 .build_spec_only(&id, &spec, &std::collections::HashMap::new())
2982 .await
2983 .unwrap();
2984 let linux = oci_spec.linux().as_ref().unwrap();
2985 let namespaces = linux.namespaces().as_ref().unwrap();
2986
2987 let namespace_types: Vec<_> = namespaces
2989 .iter()
2990 .map(oci_spec::runtime::LinuxNamespace::typ)
2991 .collect();
2992 assert!(namespace_types.contains(&LinuxNamespaceType::Pid));
2993 assert!(namespace_types.contains(&LinuxNamespaceType::Ipc));
2994 assert!(namespace_types.contains(&LinuxNamespaceType::Uts));
2995 assert!(namespace_types.contains(&LinuxNamespaceType::Mount));
2996 assert!(
2997 !namespace_types.contains(&LinuxNamespaceType::Network),
2998 "Network namespace should NOT be present in host_network mode"
2999 );
3000 }
3001
3002 #[test]
3003 fn test_build_default_mounts() {
3004 let spec = mock_spec();
3005 let builder = BundleBuilder::new("/tmp/test-bundle".into());
3006
3007 let mounts = builder.build_default_mounts(&spec).unwrap();
3008
3009 let mount_destinations: Vec<_> = mounts
3011 .iter()
3012 .map(|m| m.destination().to_string_lossy().to_string())
3013 .collect();
3014 assert!(mount_destinations.contains(&"/proc".to_string()));
3015 assert!(mount_destinations.contains(&"/dev".to_string()));
3016 assert!(mount_destinations.contains(&"/dev/pts".to_string()));
3017 assert!(mount_destinations.contains(&"/dev/shm".to_string()));
3018 assert!(mount_destinations.contains(&"/sys".to_string()));
3019 }
3020
3021 #[test]
3022 fn test_build_storage_mounts_bind() {
3023 let spec = serde_yaml::from_str::<zlayer_spec::DeploymentSpec>(
3024 r"
3025version: v1
3026deployment: test
3027services:
3028 test:
3029 image:
3030 name: test:latest
3031 storage:
3032 - type: bind
3033 source: /host/data
3034 target: /app/data
3035 readonly: true
3036",
3037 )
3038 .unwrap()
3039 .services
3040 .remove("test")
3041 .unwrap();
3042
3043 let builder = BundleBuilder::new("/tmp/test-bundle".into());
3044 let volume_paths = std::collections::HashMap::new();
3045
3046 let mounts = builder.build_storage_mounts(&spec, &volume_paths).unwrap();
3047
3048 assert_eq!(mounts.len(), 1);
3049 assert_eq!(mounts[0].destination().to_string_lossy(), "/app/data");
3050 assert_eq!(
3051 mounts[0]
3052 .source()
3053 .as_ref()
3054 .map(|s| s.to_string_lossy().to_string()),
3055 Some("/host/data".to_string())
3056 );
3057 let options = mounts[0].options().as_ref().unwrap();
3058 assert!(options.contains(&"rbind".to_string()));
3059 assert!(options.contains(&"ro".to_string()));
3060 }
3061
3062 #[test]
3063 fn test_build_storage_mounts_named() {
3064 let spec = serde_yaml::from_str::<zlayer_spec::DeploymentSpec>(
3065 r"
3066version: v1
3067deployment: test
3068services:
3069 test:
3070 image:
3071 name: test:latest
3072 storage:
3073 - type: named
3074 name: my-volume
3075 target: /app/data
3076",
3077 )
3078 .unwrap()
3079 .services
3080 .remove("test")
3081 .unwrap();
3082
3083 let dirs = zlayer_paths::ZLayerDirs::system_default();
3084 let builder = BundleBuilder::new("/tmp/test-bundle".into());
3085 let mut volume_paths = std::collections::HashMap::new();
3086 volume_paths.insert("my-volume".to_string(), dirs.volumes().join("my-volume"));
3087
3088 let mounts = builder.build_storage_mounts(&spec, &volume_paths).unwrap();
3089
3090 assert_eq!(mounts.len(), 1);
3091 assert_eq!(mounts[0].destination().to_string_lossy(), "/app/data");
3092 assert_eq!(
3093 mounts[0]
3094 .source()
3095 .as_ref()
3096 .map(|s| s.to_string_lossy().to_string()),
3097 Some(
3098 dirs.volumes()
3099 .join("my-volume")
3100 .to_string_lossy()
3101 .into_owned()
3102 )
3103 );
3104 }
3105
3106 #[test]
3107 fn test_build_storage_mounts_tmpfs() {
3108 let spec = serde_yaml::from_str::<zlayer_spec::DeploymentSpec>(
3109 r"
3110version: v1
3111deployment: test
3112services:
3113 test:
3114 image:
3115 name: test:latest
3116 storage:
3117 - type: tmpfs
3118 target: /app/tmp
3119 size: 256Mi
3120 mode: 1777
3121",
3122 )
3123 .unwrap()
3124 .services
3125 .remove("test")
3126 .unwrap();
3127
3128 let builder = BundleBuilder::new("/tmp/test-bundle".into());
3129 let volume_paths = std::collections::HashMap::new();
3130
3131 let mounts = builder.build_storage_mounts(&spec, &volume_paths).unwrap();
3132
3133 assert_eq!(mounts.len(), 1);
3134 assert_eq!(mounts[0].destination().to_string_lossy(), "/app/tmp");
3135 assert_eq!(mounts[0].typ().as_ref().map(String::as_str), Some("tmpfs"));
3136 let options = mounts[0].options().as_ref().unwrap();
3137 assert!(options.iter().any(|o| o.starts_with("size=")));
3138 assert!(options.iter().any(|o| o.starts_with("mode=")));
3139 }
3140
3141 #[test]
3142 fn test_build_storage_mounts_multiple() {
3143 let spec = serde_yaml::from_str::<zlayer_spec::DeploymentSpec>(
3144 r"
3145version: v1
3146deployment: test
3147services:
3148 test:
3149 image:
3150 name: test:latest
3151 storage:
3152 - type: bind
3153 source: /etc/config
3154 target: /app/config
3155 readonly: true
3156 - type: named
3157 name: app-data
3158 target: /app/data
3159 - type: tmpfs
3160 target: /app/tmp
3161",
3162 )
3163 .unwrap()
3164 .services
3165 .remove("test")
3166 .unwrap();
3167
3168 let dirs = zlayer_paths::ZLayerDirs::system_default();
3169 let builder = BundleBuilder::new("/tmp/test-bundle".into());
3170 let mut volume_paths = std::collections::HashMap::new();
3171 volume_paths.insert("app-data".to_string(), dirs.volumes().join("app-data"));
3172
3173 let mounts = builder.build_storage_mounts(&spec, &volume_paths).unwrap();
3174
3175 assert_eq!(mounts.len(), 3);
3176
3177 let destinations: Vec<String> = mounts
3179 .iter()
3180 .map(|m| m.destination().to_string_lossy().to_string())
3181 .collect();
3182 assert!(destinations.contains(&"/app/config".to_string()));
3183 assert!(destinations.contains(&"/app/data".to_string()));
3184 assert!(destinations.contains(&"/app/tmp".to_string()));
3185 }
3186
3187 #[test]
3188 fn test_build_storage_mounts_anonymous_missing_path() {
3189 let spec = serde_yaml::from_str::<zlayer_spec::DeploymentSpec>(
3190 r"
3191version: v1
3192deployment: test
3193services:
3194 test:
3195 image:
3196 name: test:latest
3197 storage:
3198 - type: anonymous
3199 target: /app/cache
3200",
3201 )
3202 .unwrap()
3203 .services
3204 .remove("test")
3205 .unwrap();
3206
3207 let builder = BundleBuilder::new("/tmp/test-bundle".into());
3208 let volume_paths = std::collections::HashMap::new(); let result = builder.build_storage_mounts(&spec, &volume_paths);
3211
3212 assert!(result.is_err());
3214 }
3215
3216 #[cfg(target_os = "linux")]
3217 #[tokio::test]
3218 async fn test_oci_spec_includes_storage_mounts() {
3219 let id = ContainerId::new("test".to_string(), 1);
3220 let spec = serde_yaml::from_str::<zlayer_spec::DeploymentSpec>(
3221 r"
3222version: v1
3223deployment: test
3224services:
3225 test:
3226 image:
3227 name: test:latest
3228 storage:
3229 - type: bind
3230 source: /host/data
3231 target: /app/data
3232 - type: tmpfs
3233 target: /app/tmp
3234",
3235 )
3236 .unwrap()
3237 .services
3238 .remove("test")
3239 .unwrap();
3240
3241 let builder = BundleBuilder::new("/tmp/test-bundle".into());
3242 let volume_paths = std::collections::HashMap::new();
3243
3244 let oci_spec = builder
3245 .build_spec_only(&id, &spec, &volume_paths)
3246 .await
3247 .unwrap();
3248
3249 let mounts = oci_spec.mounts().as_ref().unwrap();
3251 let destinations: Vec<String> = mounts
3252 .iter()
3253 .map(|m| m.destination().to_string_lossy().to_string())
3254 .collect();
3255
3256 assert!(destinations.contains(&"/proc".to_string())); assert!(destinations.contains(&"/dev".to_string())); assert!(destinations.contains(&"/app/data".to_string())); assert!(destinations.contains(&"/app/tmp".to_string())); }
3262
3263 fn mock_gpu_spec(vendor: &str, count: u32) -> ServiceSpec {
3264 let yaml = format!(
3265 "
3266version: v1
3267deployment: test
3268services:
3269 test:
3270 rtype: service
3271 image:
3272 name: test:latest
3273 resources:
3274 gpu:
3275 count: {count}
3276 vendor: {vendor}
3277 endpoints:
3278 - name: http
3279 protocol: http
3280 port: 8080
3281"
3282 );
3283 serde_yaml::from_str::<DeploymentSpec>(&yaml)
3284 .unwrap()
3285 .services
3286 .remove("test")
3287 .unwrap()
3288 }
3289
3290 fn write_nvidia_cdi_fixture(dir: &std::path::Path, json: &str) {
3291 std::fs::write(dir.join("nvidia.json"), json).unwrap();
3292 }
3293
3294 fn nvidia_cdi_fixture() -> &'static str {
3295 r#"{
3296 "cdiVersion": "0.6.0",
3297 "kind": "nvidia.com/gpu",
3298 "devices": [{
3299 "name": "0",
3300 "containerEdits": {
3301 "deviceNodes": [
3302 {"path": "/dev/nvidia0", "type": "c", "major": 195, "minor": 0}
3303 ],
3304 "env": ["NVIDIA_VISIBLE_DEVICES=0"],
3305 "hooks": {
3306 "createContainer": [{
3307 "path": "/usr/bin/nvidia-container-runtime-hook",
3308 "args": ["nvidia-container-runtime-hook", "prestart"]
3309 }]
3310 }
3311 }
3312 }]
3313 }"#
3314 }
3315
3316 #[cfg(target_os = "linux")]
3317 #[tokio::test]
3318 async fn gpu_spec_translates_to_cdi_device_nodes() {
3319 let dir = tempfile::tempdir().unwrap();
3320 write_nvidia_cdi_fixture(dir.path(), nvidia_cdi_fixture());
3321 let registry = std::sync::Arc::new(crate::cdi::CdiRegistry::discover_from(&[dir.path()]));
3322
3323 let id = ContainerId::new("test".to_string(), 1);
3324 let spec = mock_gpu_spec("nvidia", 1);
3325 let builder = BundleBuilder::new("/tmp/test-bundle-cdi".into()).with_cdi_registry(registry);
3326
3327 let oci_spec = builder
3328 .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3329 .await
3330 .expect("build with CDI fixture");
3331
3332 let linux = oci_spec.linux().as_ref().expect("linux config present");
3334 let devices = linux.devices().as_ref().expect("devices present");
3335 assert!(
3336 devices
3337 .iter()
3338 .any(|d| d.path() == std::path::Path::new("/dev/nvidia0")),
3339 "expected /dev/nvidia0 from CDI fixture; got {:?}",
3340 devices
3341 .iter()
3342 .map(oci_spec::runtime::LinuxDevice::path)
3343 .collect::<Vec<_>>()
3344 );
3345
3346 let process = oci_spec.process().as_ref().expect("process present");
3348 let env = process.env().as_ref().expect("env present");
3349 assert!(
3350 env.iter().any(|e| e == "NVIDIA_VISIBLE_DEVICES=0"),
3351 "expected NVIDIA_VISIBLE_DEVICES=0 in env; got {env:?}"
3352 );
3353
3354 let hooks = oci_spec.hooks().as_ref().expect("hooks present");
3356 let create_container = hooks
3357 .create_container()
3358 .as_ref()
3359 .expect("createContainer hooks present");
3360 assert_eq!(create_container.len(), 1);
3361 assert_eq!(
3362 create_container[0].path(),
3363 &std::path::PathBuf::from("/usr/bin/nvidia-container-runtime-hook")
3364 );
3365 }
3366
3367 #[tokio::test]
3368 async fn gpu_spec_with_missing_cdi_returns_error() {
3369 let dir = tempfile::tempdir().unwrap();
3371 let registry = std::sync::Arc::new(crate::cdi::CdiRegistry::discover_from(&[dir.path()]));
3372
3373 let id = ContainerId::new("test".to_string(), 1);
3374 let spec = mock_gpu_spec("nvidia", 1);
3375 let builder =
3376 BundleBuilder::new("/tmp/test-bundle-cdi-missing".into()).with_cdi_registry(registry);
3377
3378 let err = builder
3379 .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3380 .await
3381 .expect_err("should fail when CDI registry is empty");
3382
3383 match err {
3384 AgentError::InvalidSpec(msg) => {
3385 assert!(
3386 msg.contains("nvidia") || msg.contains("CDI"),
3387 "error should mention CDI / vendor; got: {msg}"
3388 );
3389 }
3390 other => panic!("expected InvalidSpec, got {other:?}"),
3391 }
3392 }
3393
3394 #[tokio::test]
3395 async fn gpu_spec_with_unknown_device_returns_error() {
3396 let dir = tempfile::tempdir().unwrap();
3399 write_nvidia_cdi_fixture(dir.path(), nvidia_cdi_fixture());
3400 let registry = std::sync::Arc::new(crate::cdi::CdiRegistry::discover_from(&[dir.path()]));
3401
3402 let id = ContainerId::new("test".to_string(), 1);
3403 let spec = mock_gpu_spec("nvidia", 2);
3404 let builder =
3405 BundleBuilder::new("/tmp/test-bundle-cdi-unknown".into()).with_cdi_registry(registry);
3406
3407 let err = builder
3408 .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3409 .await
3410 .expect_err("should fail when device '1' is not declared");
3411 match err {
3412 AgentError::InvalidSpec(msg) => {
3413 assert!(
3414 msg.contains("'1'") || msg.contains("device"),
3415 "error should mention the missing device; got: {msg}"
3416 );
3417 }
3418 other => panic!("expected InvalidSpec, got {other:?}"),
3419 }
3420 }
3421
3422 #[cfg(target_os = "linux")]
3423 #[tokio::test]
3424 async fn gpu_spec_with_all_devices_expands_to_all_in_spec() {
3425 let dir = tempfile::tempdir().unwrap();
3427 let fixture = r#"{
3428 "cdiVersion": "0.6.0",
3429 "kind": "nvidia.com/gpu",
3430 "devices": [
3431 {
3432 "name": "0",
3433 "containerEdits": {
3434 "env": ["NVIDIA_VISIBLE_DEVICES=0"],
3435 "deviceNodes": [
3436 {"path": "/dev/nvidia0", "type": "c", "major": 195, "minor": 0}
3437 ]
3438 }
3439 },
3440 {
3441 "name": "1",
3442 "containerEdits": {
3443 "env": ["NVIDIA_VISIBLE_DEVICES=1"],
3444 "deviceNodes": [
3445 {"path": "/dev/nvidia1", "type": "c", "major": 195, "minor": 1}
3446 ]
3447 }
3448 }
3449 ]
3450 }"#;
3451 write_nvidia_cdi_fixture(dir.path(), fixture);
3452 let registry = std::sync::Arc::new(crate::cdi::CdiRegistry::discover_from(&[dir.path()]));
3453
3454 let edits = registry
3457 .resolve_for_kind("nvidia.com/gpu", &["all".to_string()])
3458 .expect("resolve all");
3459 assert_eq!(edits.len(), 2);
3460
3461 let id = ContainerId::new("test".to_string(), 1);
3464 let spec = mock_gpu_spec("nvidia", 2);
3465 let builder =
3466 BundleBuilder::new("/tmp/test-bundle-cdi-all".into()).with_cdi_registry(registry);
3467
3468 let oci_spec = builder
3469 .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3470 .await
3471 .expect("build with 2-device fixture");
3472
3473 let devices = oci_spec
3474 .linux()
3475 .as_ref()
3476 .unwrap()
3477 .devices()
3478 .as_ref()
3479 .expect("devices present");
3480 let paths: Vec<_> = devices.iter().map(|d| d.path().clone()).collect();
3481 assert!(paths.contains(&std::path::PathBuf::from("/dev/nvidia0")));
3482 assert!(paths.contains(&std::path::PathBuf::from("/dev/nvidia1")));
3483 }
3484
3485 fn build_nvidia_cdi_registry(dir: &std::path::Path) -> std::sync::Arc<crate::cdi::CdiRegistry> {
3490 write_nvidia_cdi_fixture(dir, nvidia_cdi_fixture());
3491 std::sync::Arc::new(crate::cdi::CdiRegistry::discover_from(&[dir]))
3492 }
3493
3494 #[cfg(target_os = "linux")]
3495 #[tokio::test]
3496 async fn gpu_spec_with_mps_sharing_injects_env_and_mounts() {
3497 let cdi_dir = tempfile::tempdir().unwrap();
3501 let mps_root = tempfile::tempdir().unwrap();
3502 let pipe_dir = mps_root.path().join("nvidia-mps");
3503 let log_dir = mps_root.path().join("nvidia-log");
3504 std::fs::create_dir(&pipe_dir).unwrap();
3505 std::fs::create_dir(&log_dir).unwrap();
3506 let registry = build_nvidia_cdi_registry(cdi_dir.path());
3507
3508 let id = ContainerId::new("test".to_string(), 1);
3509 let mut spec = mock_gpu_spec("nvidia", 1);
3510 let gpu = spec.resources.gpu.as_mut().expect("gpu spec set");
3511 gpu.sharing = Some(zlayer_spec::GpuSharingMode::Mps);
3512 gpu.mps_pipe_dir = Some(pipe_dir.to_string_lossy().into_owned());
3513 gpu.mps_log_dir = Some(log_dir.to_string_lossy().into_owned());
3514
3515 let builder =
3516 BundleBuilder::new("/tmp/test-bundle-mps-env".into()).with_cdi_registry(registry);
3517 let oci_spec = builder
3518 .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3519 .await
3520 .expect("build with MPS sharing");
3521
3522 let env = oci_spec
3523 .process()
3524 .as_ref()
3525 .and_then(|p| p.env().as_ref())
3526 .expect("env present");
3527 let pipe_expect = format!("CUDA_MPS_PIPE_DIRECTORY={}", pipe_dir.display());
3528 let log_expect = format!("CUDA_MPS_LOG_DIRECTORY={}", log_dir.display());
3529 assert!(
3530 env.iter().any(|e| e == &pipe_expect),
3531 "expected {pipe_expect} in env; got {env:?}"
3532 );
3533 assert!(
3534 env.iter().any(|e| e == &log_expect),
3535 "expected {log_expect} in env; got {env:?}"
3536 );
3537
3538 let mounts = oci_spec.mounts().as_ref().expect("mounts present");
3539 assert!(
3540 mounts
3541 .iter()
3542 .any(|m| m.destination() == &pipe_dir && m.source().as_ref() == Some(&pipe_dir)),
3543 "expected bind mount of MPS pipe dir {}; got destinations {:?}",
3544 pipe_dir.display(),
3545 mounts.iter().map(Mount::destination).collect::<Vec<_>>()
3546 );
3547 assert!(
3548 mounts
3549 .iter()
3550 .any(|m| m.destination() == &log_dir && m.source().as_ref() == Some(&log_dir)),
3551 "expected bind mount of MPS log dir {}",
3552 log_dir.display()
3553 );
3554 }
3555
3556 #[tokio::test]
3557 async fn gpu_spec_with_mps_sharing_fails_when_pipe_dir_missing() {
3558 let cdi_dir = tempfile::tempdir().unwrap();
3559 let registry = build_nvidia_cdi_registry(cdi_dir.path());
3560
3561 let id = ContainerId::new("test".to_string(), 1);
3562 let mut spec = mock_gpu_spec("nvidia", 1);
3563 let gpu = spec.resources.gpu.as_mut().expect("gpu spec set");
3564 gpu.sharing = Some(zlayer_spec::GpuSharingMode::Mps);
3565 let missing = tempfile::tempdir().unwrap();
3568 let missing_path = missing.path().join("definitely-not-here");
3569 gpu.mps_pipe_dir = Some(missing_path.to_string_lossy().into_owned());
3570
3571 let builder =
3572 BundleBuilder::new("/tmp/test-bundle-mps-missing".into()).with_cdi_registry(registry);
3573 let err = builder
3574 .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3575 .await
3576 .expect_err("should fail when MPS pipe dir is missing");
3577 match err {
3578 AgentError::GpuSharingUnavailable { mode, reason } => {
3579 assert_eq!(mode, "mps");
3580 assert!(
3581 reason.contains("pipe") || reason.contains(&missing_path.display().to_string()),
3582 "reason should mention the missing path; got: {reason}"
3583 );
3584 }
3585 other => panic!("expected GpuSharingUnavailable, got {other:?}"),
3586 }
3587 }
3588
3589 #[cfg(target_os = "linux")]
3590 #[tokio::test]
3591 async fn gpu_spec_with_timeslicing_injects_visible_devices() {
3592 let cdi_dir = tempfile::tempdir().unwrap();
3593 let registry = build_nvidia_cdi_registry(cdi_dir.path());
3594
3595 let id = ContainerId::new("test".to_string(), 1);
3596 let mut spec = mock_gpu_spec("nvidia", 1);
3597 let gpu = spec.resources.gpu.as_mut().expect("gpu spec set");
3598 gpu.sharing = Some(zlayer_spec::GpuSharingMode::TimeSlice);
3599 gpu.time_slice_index = Some(2);
3600
3601 let builder =
3602 BundleBuilder::new("/tmp/test-bundle-timeslice".into()).with_cdi_registry(registry);
3603 let oci_spec = builder
3604 .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3605 .await
3606 .expect("build with time-slicing");
3607
3608 let env = oci_spec
3609 .process()
3610 .as_ref()
3611 .and_then(|p| p.env().as_ref())
3612 .expect("env present");
3613 let cuda_entries: Vec<&String> = env
3616 .iter()
3617 .filter(|e| e.starts_with("CUDA_VISIBLE_DEVICES="))
3618 .collect();
3619 assert_eq!(
3620 cuda_entries.len(),
3621 1,
3622 "exactly one CUDA_VISIBLE_DEVICES expected; got {cuda_entries:?}"
3623 );
3624 assert_eq!(cuda_entries[0], "CUDA_VISIBLE_DEVICES=2");
3625 }
3626
3627 #[cfg(target_os = "linux")]
3628 #[tokio::test]
3629 async fn gpu_spec_without_sharing_omits_mps_env() {
3630 let cdi_dir = tempfile::tempdir().unwrap();
3631 let registry = build_nvidia_cdi_registry(cdi_dir.path());
3632
3633 let id = ContainerId::new("test".to_string(), 1);
3634 let spec = mock_gpu_spec("nvidia", 1);
3635 assert!(spec.resources.gpu.as_ref().unwrap().sharing.is_none());
3636
3637 let builder =
3638 BundleBuilder::new("/tmp/test-bundle-no-sharing".into()).with_cdi_registry(registry);
3639 let oci_spec = builder
3640 .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3641 .await
3642 .expect("build without sharing");
3643
3644 let env = oci_spec
3645 .process()
3646 .as_ref()
3647 .and_then(|p| p.env().as_ref())
3648 .expect("env present");
3649 assert!(
3650 !env.iter().any(|e| e.starts_with("CUDA_MPS_")),
3651 "no CUDA_MPS_* env should be present without sharing; got {env:?}"
3652 );
3653
3654 let mounts = oci_spec.mounts().as_ref().expect("mounts present");
3658 assert!(
3659 !mounts
3660 .iter()
3661 .any(|m| { m.destination().to_string_lossy().contains("nvidia-mps") }),
3662 "no MPS pipe mount should be present without sharing"
3663 );
3664 }
3665
3666 #[cfg(unix)]
3667 mod subid_tests {
3668 use super::super::read_subid_range;
3669 use std::io::Write;
3670
3671 #[test]
3672 fn read_subid_range_returns_range_for_user() {
3673 let mut tmp = tempfile::NamedTempFile::new().unwrap();
3674 writeln!(tmp, "alice:100000:65536").unwrap();
3675 writeln!(tmp, "bob:165536:65536").unwrap();
3676 tmp.flush().unwrap();
3677 let path = tmp.path().to_str().unwrap();
3678 assert_eq!(read_subid_range(path, "bob"), Some((165_536, 65_536)));
3679 assert_eq!(read_subid_range(path, "alice"), Some((100_000, 65_536)));
3680 }
3681
3682 #[test]
3683 fn read_subid_range_returns_none_for_unknown_user() {
3684 let mut tmp = tempfile::NamedTempFile::new().unwrap();
3685 writeln!(tmp, "alice:100000:65536").unwrap();
3686 tmp.flush().unwrap();
3687 assert_eq!(
3688 read_subid_range(tmp.path().to_str().unwrap(), "carol"),
3689 None
3690 );
3691 }
3692
3693 #[test]
3694 fn read_subid_range_returns_none_on_missing_file() {
3695 assert_eq!(
3696 read_subid_range("/this/path/does/not/exist/subuid", "anyone"),
3697 None
3698 );
3699 }
3700 }
3701}