1use crate::cdi::{self, CdiContainerEdits, CdiRegistry};
9use crate::error::{AgentError, Result};
10use crate::runtime::ContainerId;
11use oci_spec::runtime::{
12 Capability, Hook, HookBuilder, Hooks, HooksBuilder, LinuxBuilder, LinuxCapabilitiesBuilder,
13 LinuxCpuBuilder, LinuxDeviceBuilder, LinuxDeviceCgroupBuilder, LinuxDeviceType,
14 LinuxMemoryBuilder, LinuxNamespaceBuilder, LinuxNamespaceType, LinuxResourcesBuilder, Mount,
15 MountBuilder, ProcessBuilder, RootBuilder, Spec, SpecBuilder, UserBuilder,
16};
17#[cfg(unix)]
20use oci_spec::runtime::LinuxIdMappingBuilder;
21use std::collections::{HashMap, HashSet};
22use std::path::{Path, PathBuf};
31use std::str::FromStr;
32use std::sync::Arc;
33use tokio::fs;
34use zlayer_secrets::SecretsProvider;
35use zlayer_spec::{GpuSharingMode, ServiceSpec, StorageSpec, StorageTier};
36
/// Host directory used for the CUDA MPS control pipes when the GPU spec does
/// not set `mps_pipe_dir`.
const DEFAULT_MPS_PIPE_DIR: &str = "/tmp/nvidia-mps";

/// Host directory used for the CUDA MPS logs when the GPU spec does not set
/// `mps_log_dir`.
const DEFAULT_MPS_LOG_DIR: &str = "/tmp/nvidia-log";

/// Path inside the container at which the host's GPU time-slicing config file
/// is bind-mounted (read-only) when time-slice sharing is requested.
const TIMESLICE_CONFIG_CONTAINER_PATH: &str = "/etc/nvidia/gpu-time-slicing.yaml";
50
/// Host directories resolved for a container that uses MPS GPU sharing.
///
/// Produced by `resolve_mps_dirs`, which only returns a value after both
/// directories were found to exist on the host.
struct MpsDirs {
    /// Directory exported to the container as `CUDA_MPS_PIPE_DIRECTORY`.
    pipe_dir: PathBuf,
    /// Directory exported to the container as `CUDA_MPS_LOG_DIRECTORY`.
    log_dir: PathBuf,
}
60
61fn resolve_mps_dirs(gpu: &zlayer_spec::GpuSpec) -> Result<Option<MpsDirs>> {
71 if gpu.sharing != Some(GpuSharingMode::Mps) {
72 return Ok(None);
73 }
74
75 let pipe_dir = PathBuf::from(gpu.mps_pipe_dir.as_deref().unwrap_or(DEFAULT_MPS_PIPE_DIR));
76 let log_dir = PathBuf::from(gpu.mps_log_dir.as_deref().unwrap_or(DEFAULT_MPS_LOG_DIR));
77
78 if !pipe_dir.is_dir() {
79 return Err(AgentError::GpuSharingUnavailable {
80 mode: "mps".to_string(),
81 reason: format!(
82 "MPS pipe directory {} does not exist; ensure nvidia-cuda-mps-control is running",
83 pipe_dir.display()
84 ),
85 });
86 }
87 if !log_dir.is_dir() {
88 return Err(AgentError::GpuSharingUnavailable {
89 mode: "mps".to_string(),
90 reason: format!(
91 "MPS log directory {} does not exist; ensure nvidia-cuda-mps-control is running",
92 log_dir.display()
93 ),
94 });
95 }
96
97 Ok(Some(MpsDirs { pipe_dir, log_dir }))
98}
99
100fn cdi_node_to_oci_device(
109 node: &crate::cdi::CdiDeviceNode,
110) -> Result<oci_spec::runtime::LinuxDevice> {
111 let host_path = node.host_path.as_deref().unwrap_or(&node.path);
112
113 let dev_type = match node.device_type.as_deref() {
114 Some("c" | "u") => LinuxDeviceType::C,
115 Some("b") => LinuxDeviceType::B,
116 Some("p") => LinuxDeviceType::P,
117 _ => get_device_type(host_path).unwrap_or(LinuxDeviceType::C),
118 };
119
120 let (major, minor) = if let (Some(maj), Some(min)) = (node.major, node.minor) {
121 (maj, min)
122 } else {
123 get_device_major_minor(host_path).unwrap_or((0, 0))
124 };
125
126 let mut builder = LinuxDeviceBuilder::default()
127 .path(node.path.clone())
128 .typ(dev_type)
129 .major(major)
130 .minor(minor);
131 if let Some(mode) = node.file_mode {
132 builder = builder.file_mode(mode);
133 } else {
134 builder = builder.file_mode(0o666u32);
135 }
136 builder = builder.uid(node.uid.unwrap_or(0));
137 builder = builder.gid(node.gid.unwrap_or(0));
138
139 builder.build().map_err(|e| {
140 AgentError::InvalidSpec(format!(
141 "failed to build CDI device {path}: {e}",
142 path = node.path
143 ))
144 })
145}
146
147fn convert_cdi_hook(cdi_hook: &crate::cdi::CdiHook) -> Result<Hook> {
149 let mut builder = HookBuilder::default().path(PathBuf::from(&cdi_hook.path));
150 if !cdi_hook.args.is_empty() {
151 builder = builder.args(cdi_hook.args.clone());
152 }
153 if !cdi_hook.env.is_empty() {
154 builder = builder.env(cdi_hook.env.clone());
155 }
156 builder
157 .build()
158 .map_err(|e| AgentError::InvalidSpec(format!("failed to build CDI hook: {e}")))
159}
160
/// Capability set granted to privileged containers.
///
/// `build_capabilities` copies this list into the bounding, effective and
/// permitted sets when `spec.privileged` is true.
const ALL_CAPABILITIES: &[Capability] = &[
    Capability::AuditControl,
    Capability::AuditRead,
    Capability::AuditWrite,
    Capability::BlockSuspend,
    Capability::Bpf,
    Capability::CheckpointRestore,
    Capability::Chown,
    Capability::DacOverride,
    Capability::DacReadSearch,
    Capability::Fowner,
    Capability::Fsetid,
    Capability::IpcLock,
    Capability::IpcOwner,
    Capability::Kill,
    Capability::Lease,
    Capability::LinuxImmutable,
    Capability::MacAdmin,
    Capability::MacOverride,
    Capability::Mknod,
    Capability::NetAdmin,
    Capability::NetBindService,
    Capability::NetBroadcast,
    Capability::NetRaw,
    Capability::Perfmon,
    Capability::Setfcap,
    Capability::Setgid,
    Capability::Setpcap,
    Capability::Setuid,
    Capability::SysAdmin,
    Capability::SysBoot,
    Capability::SysChroot,
    Capability::SysModule,
    Capability::SysNice,
    Capability::SysPacct,
    Capability::SysPtrace,
    Capability::SysRawio,
    Capability::SysResource,
    Capability::SysTime,
    Capability::SysTtyConfig,
    Capability::Syslog,
    Capability::WakeAlarm,
];
205
/// Render the contents of a `resolv.conf` file for the given nameservers.
///
/// Emits one `nameserver <addr>` line per entry, in input order, followed by
/// a trailing `options edns0` line. An empty slice yields only the options
/// line.
#[must_use]
pub fn generate_resolv_conf(nameservers: &[String]) -> String {
    let mut conf: String = nameservers
        .iter()
        .map(|server| format!("nameserver {server}\n"))
        .collect();
    conf.push_str("options edns0\n");
    conf
}
246
/// Parse a human-readable memory quantity into a number of bytes.
///
/// Accepted forms (surrounding whitespace is trimmed):
///
/// * a bare integer — bytes (`"1024"`);
/// * binary suffixes `Ki`, `Mi`, `Gi`, `Ti` — powers of 1024;
/// * decimal suffixes `K`/`k`, `M`/`m`, `G`/`g`, `T`/`t` — powers of 1000.
///
/// # Errors
///
/// Returns a descriptive message when the input is empty, when the numeric
/// part is not a valid `u64`, or when the scaled value does not fit in a
/// `u64` (previously this case overflowed: a panic in debug builds and a
/// silently wrapped value in release builds).
pub fn parse_memory_string(s: &str) -> std::result::Result<u64, String> {
    // Binary suffixes are listed first; each entry is tried in order.
    const UNITS: [(&str, u64); 12] = [
        ("Ki", 1024),
        ("Mi", 1024 * 1024),
        ("Gi", 1024 * 1024 * 1024),
        ("Ti", 1024 * 1024 * 1024 * 1024),
        ("K", 1000),
        ("k", 1000),
        ("M", 1000 * 1000),
        ("m", 1000 * 1000),
        ("G", 1000 * 1000 * 1000),
        ("g", 1000 * 1000 * 1000),
        ("T", 1000 * 1000 * 1000 * 1000),
        ("t", 1000 * 1000 * 1000 * 1000),
    ];

    let s = s.trim();
    if s.is_empty() {
        return Err("empty memory string".to_string());
    }

    let (num_str, multiplier) = UNITS
        .iter()
        .find_map(|&(suffix, factor)| s.strip_suffix(suffix).map(|rest| (rest, factor)))
        .unwrap_or((s, 1));

    let num: u64 = num_str
        .parse()
        .map_err(|e| format!("invalid number: {e}"))?;

    num.checked_mul(multiplier)
        .ok_or_else(|| format!("memory value '{s}' overflows a 64-bit byte count"))
}
281
/// Split a Linux `st_rdev` value into `(major, minor)`.
///
/// Uses the Linux/glibc `dev_t` layout: the major number is bits 8..20 plus
/// bits 32..52, the minor number is bits 0..8 plus bits 20..32. The legacy
/// `(rdev >> 8) & 0xff` / `rdev & 0xff` decoding truncates majors above 255
/// and minors above 255.
///
/// NOTE(review): this layout is Linux-specific; other Unix systems encode
/// `dev_t` differently.
#[cfg(unix)]
#[allow(clippy::cast_possible_wrap)]
fn decode_linux_rdev(rdev: u64) -> (i64, i64) {
    let major = ((rdev >> 32) & 0xffff_f000) | ((rdev >> 8) & 0x0000_0fff);
    let minor = ((rdev >> 12) & 0xffff_ff00) | (rdev & 0x0000_00ff);
    // Both values are masked to at most 32 bits, so the casts cannot wrap.
    (major as i64, minor as i64)
}

/// Read the device major/minor numbers of the node at `path`.
///
/// # Errors
///
/// Propagates the I/O error from `std::fs::metadata` (for example when the
/// path does not exist).
#[cfg(unix)]
fn get_device_major_minor(path: &str) -> std::io::Result<(i64, i64)> {
    use std::os::unix::fs::MetadataExt;
    let rdev = std::fs::metadata(path)?.rdev();
    Ok(decode_linux_rdev(rdev))
}
301
/// Non-Unix stand-in for the device number probe.
///
/// Always fails with [`std::io::ErrorKind::Unsupported`]; the path is ignored.
#[cfg(not(unix))]
fn get_device_major_minor(_path: &str) -> std::io::Result<(i64, i64)> {
    use std::io::{Error, ErrorKind};

    let unsupported = Error::new(ErrorKind::Unsupported, "device-cgroup probes require Unix");
    Err(unsupported)
}
310
311#[cfg(unix)]
316fn get_device_type(path: &str) -> std::io::Result<LinuxDeviceType> {
317 use std::os::unix::fs::FileTypeExt;
318 let metadata = std::fs::metadata(path)?;
319 let file_type = metadata.file_type();
320 if file_type.is_char_device() {
321 Ok(LinuxDeviceType::C)
322 } else if file_type.is_block_device() {
323 Ok(LinuxDeviceType::B)
324 } else {
325 Ok(LinuxDeviceType::U) }
327}
328
/// Non-Unix stand-in for the device type probe.
///
/// Always fails with [`std::io::ErrorKind::Unsupported`]; the path is ignored.
#[cfg(not(unix))]
fn get_device_type(_path: &str) -> std::io::Result<LinuxDeviceType> {
    use std::io::{Error, ErrorKind};

    let unsupported = Error::new(ErrorKind::Unsupported, "device-cgroup probes require Unix");
    Err(unsupported)
}
337
/// Builder that assembles an OCI bundle (bundle directory, `rootfs` entry and
/// `config.json`) for a container from a `ServiceSpec`.
#[derive(Clone)]
pub struct BundleBuilder {
    /// Directory in which the bundle is created.
    bundle_dir: PathBuf,
    /// Existing rootfs to symlink into the bundle; when `None`, `build`
    /// creates an empty `rootfs` directory instead.
    rootfs_path: Option<PathBuf>,
    /// Hostname override; the container id string is used when `None`.
    hostname: Option<String>,
    /// Extra environment variables, applied after image and spec variables
    /// and replacing earlier entries with the same key.
    extra_env: Vec<(String, String)>,
    /// Working-directory override; takes precedence over the spec and image
    /// working directories.
    cwd: Option<String>,
    /// Process argument override; when set, the command is not resolved from
    /// the spec or the image config.
    args: Option<Vec<String>>,
    /// Volume paths forwarded to storage-mount generation by `build`.
    volume_paths: HashMap<String, PathBuf>,
    /// Image configuration supplying the default user, environment and
    /// working directory.
    image_config: Option<zlayer_registry::ImageConfig>,
    /// Host-networking flag set through `with_host_network`.
    host_network: bool,
    /// Secrets provider for environment resolution; only used when
    /// `deployment_scope` is set as well.
    secrets_provider: Option<Arc<dyn SecretsProvider>>,
    /// Scope passed to the secrets provider during environment resolution.
    deployment_scope: Option<String>,
    /// Host socket path bind-mounted read-only into the container.
    socket_path: Option<String>,
    /// Explicit CDI registry; when set, CDI resolution failures are errors
    /// instead of falling back to built-in GPU passthrough.
    cdi_registry: Option<Arc<CdiRegistry>>,
}
385
386impl std::fmt::Debug for BundleBuilder {
387 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
388 f.debug_struct("BundleBuilder")
389 .field("bundle_dir", &self.bundle_dir)
390 .field("rootfs_path", &self.rootfs_path)
391 .field("hostname", &self.hostname)
392 .field("extra_env", &self.extra_env)
393 .field("cwd", &self.cwd)
394 .field("args", &self.args)
395 .field("volume_paths", &self.volume_paths)
396 .field("image_config", &self.image_config)
397 .field("host_network", &self.host_network)
398 .field("secrets_provider", &self.secrets_provider.is_some())
399 .field("deployment_scope", &self.deployment_scope)
400 .field("socket_path", &self.socket_path)
401 .field("cdi_registry", &self.cdi_registry.is_some())
402 .finish()
403 }
404}
405
406#[cfg(unix)]
414fn build_rootless_id_mappings(
415 host_id: u32,
416 subid_path: &str,
417 username: &str,
418) -> Vec<oci_spec::runtime::LinuxIdMapping> {
419 let mut mappings = vec![LinuxIdMappingBuilder::default()
420 .container_id(0_u32)
421 .host_id(host_id)
422 .size(1_u32)
423 .build()
424 .unwrap()];
425 if !username.is_empty() {
426 if let Some((start, count)) = read_subid_range(subid_path, username) {
427 mappings.push(
428 LinuxIdMappingBuilder::default()
429 .container_id(1_u32)
430 .host_id(start)
431 .size(count)
432 .build()
433 .unwrap(),
434 );
435 }
436 }
437 mappings
438}
439
/// Look up the subordinate ID range for `username` in a
/// `name:start:count` formatted file.
///
/// Only the first line whose name field equals `username` is considered.
/// Returns `None` when the file cannot be read, when no line matches, or
/// when that first matching line has a missing or non-numeric start/count.
#[cfg(unix)]
fn read_subid_range(path: &str, username: &str) -> Option<(u32, u32)> {
    let contents = std::fs::read_to_string(path).ok()?;

    // First entry for this user; later entries for the same user are ignored.
    let entry = contents
        .lines()
        .find(|line| line.splitn(3, ':').next() == Some(username))?;

    let mut fields = entry.splitn(3, ':').skip(1);
    let start = fields.next()?.parse::<u32>().ok()?;
    let count = fields.next()?.parse::<u32>().ok()?;
    Some((start, count))
}
462
463impl BundleBuilder {
464 #[must_use]
474 pub fn new(bundle_dir: PathBuf) -> Self {
475 Self {
476 bundle_dir,
477 rootfs_path: None,
478 hostname: None,
479 extra_env: Vec::new(),
480 cwd: None,
481 args: None,
482 volume_paths: HashMap::new(),
483 image_config: None,
484 host_network: false,
485 secrets_provider: None,
486 deployment_scope: None,
487 socket_path: None,
488 cdi_registry: None,
489 }
490 }
491
492 #[must_use]
499 pub fn with_cdi_registry(mut self, registry: Arc<CdiRegistry>) -> Self {
500 self.cdi_registry = Some(registry);
501 self
502 }
503
504 #[must_use]
506 pub fn for_container(container_id: &ContainerId) -> Self {
507 let bundle_dir = zlayer_paths::ZLayerDirs::system_default()
508 .bundles()
509 .join(container_id.to_string());
510 Self::new(bundle_dir)
511 }
512
513 #[must_use]
517 pub fn with_rootfs(mut self, rootfs_path: PathBuf) -> Self {
518 self.rootfs_path = Some(rootfs_path);
519 self
520 }
521
522 #[must_use]
524 pub fn with_hostname(mut self, hostname: String) -> Self {
525 self.hostname = Some(hostname);
526 self
527 }
528
529 #[must_use]
531 pub fn with_env(mut self, key: String, value: String) -> Self {
532 self.extra_env.push((key, value));
533 self
534 }
535
536 #[must_use]
538 pub fn with_cwd(mut self, cwd: String) -> Self {
539 self.cwd = Some(cwd);
540 self
541 }
542
543 #[must_use]
545 pub fn with_args(mut self, args: Vec<String>) -> Self {
546 self.args = Some(args);
547 self
548 }
549
550 #[must_use]
555 pub fn with_volume_paths(mut self, volume_paths: HashMap<String, PathBuf>) -> Self {
556 self.volume_paths = volume_paths;
557 self
558 }
559
560 #[must_use]
565 pub fn with_image_config(mut self, config: zlayer_registry::ImageConfig) -> Self {
566 self.image_config = Some(config);
567 self
568 }
569
570 #[must_use]
576 pub fn with_host_network(mut self, host_network: bool) -> Self {
577 self.host_network = host_network;
578 self
579 }
580
581 #[must_use]
586 pub fn with_secrets_provider(mut self, provider: Arc<dyn SecretsProvider>) -> Self {
587 self.secrets_provider = Some(provider);
588 self
589 }
590
591 #[must_use]
596 pub fn with_deployment_scope(mut self, scope: String) -> Self {
597 self.deployment_scope = Some(scope);
598 self
599 }
600
601 #[must_use]
604 pub fn with_socket_mount(mut self, path: impl Into<String>) -> Self {
605 self.socket_path = Some(path.into());
606 self
607 }
608
609 #[must_use]
611 pub fn bundle_dir(&self) -> &Path {
612 &self.bundle_dir
613 }
614
615 #[cfg(unix)]
634 pub async fn build(&self, container_id: &ContainerId, spec: &ServiceSpec) -> Result<PathBuf> {
635 fs::create_dir_all(&self.bundle_dir)
637 .await
638 .map_err(|e| AgentError::CreateFailed {
639 id: container_id.to_string(),
640 reason: format!("failed to create bundle directory: {e}"),
641 })?;
642
643 let rootfs_in_bundle = self.bundle_dir.join("rootfs");
645 if let Some(ref rootfs_path) = self.rootfs_path {
646 let _ = fs::remove_file(&rootfs_in_bundle).await;
648 let _ = fs::remove_dir(&rootfs_in_bundle).await;
649
650 #[cfg(unix)]
655 tokio::fs::symlink(rootfs_path, &rootfs_in_bundle)
656 .await
657 .map_err(|e| AgentError::CreateFailed {
658 id: container_id.to_string(),
659 reason: format!(
660 "failed to symlink rootfs from {} to {}: {}",
661 rootfs_path.display(),
662 rootfs_in_bundle.display(),
663 e
664 ),
665 })?;
666
667 #[cfg(windows)]
668 tokio::fs::symlink_dir(rootfs_path, &rootfs_in_bundle)
669 .await
670 .map_err(|e| AgentError::CreateFailed {
671 id: container_id.to_string(),
672 reason: format!(
673 "failed to symlink rootfs from {} to {}: {}",
674 rootfs_path.display(),
675 rootfs_in_bundle.display(),
676 e
677 ),
678 })?;
679 } else {
680 fs::create_dir_all(&rootfs_in_bundle)
682 .await
683 .map_err(|e| AgentError::CreateFailed {
684 id: container_id.to_string(),
685 reason: format!("failed to create rootfs directory: {e}"),
686 })?;
687 }
688
689 let oci_spec = self
691 .build_spec_only(container_id, spec, &self.volume_paths)
692 .await?;
693
694 let config_path = self.bundle_dir.join("config.json");
696 let config_json =
697 serde_json::to_string_pretty(&oci_spec).map_err(|e| AgentError::CreateFailed {
698 id: container_id.to_string(),
699 reason: format!("failed to serialize OCI spec: {e}"),
700 })?;
701
702 fs::write(&config_path, config_json)
703 .await
704 .map_err(|e| AgentError::CreateFailed {
705 id: container_id.to_string(),
706 reason: format!("failed to write config.json: {e}"),
707 })?;
708
709 tracing::debug!(
710 "Created OCI bundle at {} for container {}",
711 self.bundle_dir.display(),
712 container_id
713 );
714
715 Ok(self.bundle_dir.clone())
716 }
717
718 pub async fn build_spec_only(
738 &self,
739 container_id: &ContainerId,
740 spec: &ServiceSpec,
741 volume_paths: &std::collections::HashMap<String, PathBuf>,
742 ) -> Result<oci_spec::runtime::Spec> {
743 self.build_oci_spec(container_id, spec, volume_paths).await
744 }
745
746 fn resolve_cdi_edits(&self, spec: &ServiceSpec) -> Result<Option<Vec<CdiContainerEdits>>> {
761 let Some(ref gpu) = spec.resources.gpu else {
762 return Ok(None);
763 };
764
765 let Some(kind) = cdi::vendor_to_cdi_kind(&gpu.vendor) else {
768 return Ok(None);
769 };
770
771 let (registry, strict) = if let Some(reg) = &self.cdi_registry {
777 (reg.clone(), true)
778 } else {
779 let reg = Arc::new(CdiRegistry::discover());
780 if reg.is_empty() {
781 return Ok(None);
782 }
783 (reg, false)
784 };
785
786 let device_names: Vec<String> = (0..gpu.count).map(|i| i.to_string()).collect();
787
788 match registry.resolve_for_kind(kind, &device_names) {
789 Ok(edits) => Ok(Some(edits)),
790 Err(err) => {
791 if strict {
792 Err(AgentError::InvalidSpec(format!(
793 "CDI resolution failed for vendor '{}': {err}",
794 gpu.vendor
795 )))
796 } else {
797 tracing::warn!(
798 vendor = %gpu.vendor,
799 kind = %kind,
800 error = %err,
801 "CDI resolution failed; falling back to baked-in GPU device passthrough"
802 );
803 Ok(None)
804 }
805 }
806 }
807 }
808
    /// Assemble the complete OCI runtime spec for `container_id`.
    ///
    /// The spec is built in this order: CDI edits, process user, environment
    /// (image → defaults → spec → builder extras → GPU/CDI → MPS →
    /// time-slice → distributed), capabilities, working directory, process
    /// arguments, root, mounts (defaults → storage → socket → resolv.conf →
    /// CDI → MPS → time-slicing config), Linux config, hostname and finally
    /// CDI hooks.
    ///
    /// Side effect: when custom DNS servers are configured and the bundle
    /// directory exists, a `resolv.conf` file is written into the bundle
    /// directory.
    ///
    /// # Errors
    ///
    /// Returns [`AgentError::InvalidSpec`] for builder or environment
    /// resolution failures and [`AgentError::GpuSharingUnavailable`] when a
    /// requested GPU sharing mode cannot be satisfied on the host.
    ///
    /// # Panics
    ///
    /// Panics if the socket bind mount cannot be built (see the `expect`
    /// below).
    #[allow(clippy::too_many_lines)]
    async fn build_oci_spec(
        &self,
        container_id: &ContainerId,
        spec: &ServiceSpec,
        volume_paths: &std::collections::HashMap<String, PathBuf>,
    ) -> Result<Spec> {
        // Resolved once up front; reused for env, mounts, Linux config and hooks.
        let cdi_edits = self.resolve_cdi_edits(spec)?;

        // Process user from the image config's `user` string ("uid" or "uid:gid").
        let user = {
            let (uid, gid) = if let Some(user_str) = self
                .image_config
                .as_ref()
                .and_then(|c| c.user.as_ref())
                .filter(|u| !u.is_empty())
            {
                let parts: Vec<&str> = user_str.splitn(2, ':').collect();
                // NOTE(review): a non-numeric user (e.g. a user name) falls back to
                // uid 0 here — confirm that running as root is the intended fallback.
                let uid = parts[0].parse::<u32>().unwrap_or(0);
                let gid = if parts.len() > 1 {
                    parts[1].parse::<u32>().unwrap_or(0)
                } else {
                    // No explicit group: the gid mirrors the uid.
                    uid
                };
                (uid, gid)
            } else {
                (0u32, 0u32)
            };

            UserBuilder::default()
                .uid(uid)
                .gid(gid)
                .build()
                .map_err(|e| AgentError::InvalidSpec(format!("failed to build user: {e}")))?
        };

        // `env` holds "KEY=value" entries in order; `env_keys` tracks which keys are
        // present so later layers can replace earlier ones.
        let mut env: Vec<String> = Vec::new();
        let mut env_keys: HashSet<String> = HashSet::new();

        // Layer 1: environment from the image config.
        if let Some(img_env) = self.image_config.as_ref().and_then(|c| c.env.as_ref()) {
            for entry in img_env {
                if let Some(key) = entry.split('=').next() {
                    env_keys.insert(key.to_string());
                }
                env.push(entry.clone());
            }
        }

        // Layer 2: defaults for PATH and TERM when the image did not set them.
        if !env_keys.contains("PATH") {
            env.push(
                "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin".to_string(),
            );
            env_keys.insert("PATH".to_string());
        }

        if !env_keys.contains("TERM") {
            env.push("TERM=xterm".to_string());
            env_keys.insert("TERM".to_string());
        }

        // Layer 3: environment from the service spec. Secret-aware resolution is
        // used only when both a provider and a deployment scope are configured.
        if let (Some(secrets_provider), Some(scope)) =
            (&self.secrets_provider, &self.deployment_scope)
        {
            let resolved_map =
                crate::env::resolve_env_with_secrets(&spec.env, secrets_provider.as_ref(), scope)
                    .await
                    .map_err(|e| {
                        AgentError::InvalidSpec(format!(
                            "environment variable resolution failed: {e}"
                        ))
                    })?;

            for (key, value) in &resolved_map {
                if env_keys.contains(key.as_str()) {
                    env.retain(|e| e.split('=').next() != Some(key.as_str()));
                }
                env_keys.insert(key.clone());
                env.push(format!("{key}={value}"));
            }
        } else {
            let resolved = crate::env::resolve_env_vars_with_warnings(&spec.env).map_err(|e| {
                AgentError::InvalidSpec(format!("environment variable resolution failed: {e}"))
            })?;

            for warning in &resolved.warnings {
                tracing::warn!(container = %container_id, "{}", warning);
            }

            for var in &resolved.vars {
                if let Some(key) = var.split('=').next() {
                    if env_keys.contains(key) {
                        env.retain(|e| e.split('=').next() != Some(key));
                    }
                    env_keys.insert(key.to_string());
                }
                env.push(var.clone());
            }
        }

        // Layer 4: builder-supplied extras override everything set so far.
        for (key, value) in &self.extra_env {
            if env_keys.contains(key.as_str()) {
                env.retain(|e| e.split('=').next() != Some(key.as_str()));
            }
            env_keys.insert(key.clone());
            env.push(format!("{key}={value}"));
        }

        // Layer 5: GPU visibility. CDI-provided variables replace existing keys;
        // without CDI edits, vendor-specific visibility variables are appended.
        if let Some(ref edits_per_device) = cdi_edits {
            for edits in edits_per_device {
                for entry in &edits.env {
                    if let Some(key) = entry.split('=').next() {
                        if env_keys.contains(key) {
                            env.retain(|e| e.split('=').next() != Some(key));
                        }
                        env_keys.insert(key.to_string());
                    }
                    env.push(entry.clone());
                }
            }
        } else if let Some(ref gpu) = spec.resources.gpu {
            let indices: Vec<String> = (0..gpu.count).map(|i| i.to_string()).collect();
            let device_list = indices.join(",");
            // NOTE(review): unlike the layers above, these pushes neither consult nor
            // update `env_keys`, so a key already set earlier ends up in `env` twice.
            match gpu.vendor.as_str() {
                "nvidia" => {
                    env.push(format!("NVIDIA_VISIBLE_DEVICES={device_list}"));
                    env.push(format!("CUDA_VISIBLE_DEVICES={device_list}"));
                }
                "amd" => {
                    env.push(format!("ROCR_VISIBLE_DEVICES={device_list}"));
                    env.push(format!("HIP_VISIBLE_DEVICES={device_list}"));
                }
                "intel" => {
                    env.push(format!("ZE_AFFINITY_MASK={device_list}"));
                }
                _ => {}
            }
        }

        // Layer 6: MPS sharing. The resolved directories are reused further down
        // for the bind mounts.
        let mps_dirs = if let Some(ref gpu) = spec.resources.gpu {
            resolve_mps_dirs(gpu)?
        } else {
            None
        };
        if let Some(ref dirs) = mps_dirs {
            let pipe = format!("CUDA_MPS_PIPE_DIRECTORY={}", dirs.pipe_dir.display());
            let log = format!("CUDA_MPS_LOG_DIRECTORY={}", dirs.log_dir.display());
            if env_keys.contains("CUDA_MPS_PIPE_DIRECTORY") {
                env.retain(|e| e.split('=').next() != Some("CUDA_MPS_PIPE_DIRECTORY"));
            }
            if env_keys.contains("CUDA_MPS_LOG_DIRECTORY") {
                env.retain(|e| e.split('=').next() != Some("CUDA_MPS_LOG_DIRECTORY"));
            }
            env_keys.insert("CUDA_MPS_PIPE_DIRECTORY".to_string());
            env_keys.insert("CUDA_MPS_LOG_DIRECTORY".to_string());
            env.push(pipe);
            env.push(log);
        }
        // Layer 7: time-slicing pins CUDA_VISIBLE_DEVICES to the assigned index,
        // removing every earlier entry for that key first.
        if let Some(ref gpu) = spec.resources.gpu {
            if gpu.sharing == Some(GpuSharingMode::TimeSlice) {
                if let Some(idx) = gpu.time_slice_index {
                    env.retain(|e| e.split('=').next() != Some("CUDA_VISIBLE_DEVICES"));
                    env_keys.insert("CUDA_VISIBLE_DEVICES".to_string());
                    env.push(format!("CUDA_VISIBLE_DEVICES={idx}"));
                }
            }
        }

        // Layer 8: distributed-training rendezvous variables.
        if let Some(ref gpu) = spec.resources.gpu {
            if let Some(ref dist) = gpu.distributed {
                // NOTE(review): world size and ranks are fixed to a single process
                // here, and these pushes do not de-duplicate against earlier keys.
                env.push(format!("MASTER_PORT={}", dist.master_port));
                env.push(format!("MASTER_ADDR={}", container_id.service));
                env.push("WORLD_SIZE=1".to_string());
                env.push("RANK=0".to_string());
                env.push("LOCAL_RANK=0".to_string());
                match dist.backend.as_str() {
                    "nccl" => env.push("NCCL_SOCKET_IFNAME=eth0".to_string()),
                    "gloo" => env.push("GLOO_SOCKET_IFNAME=eth0".to_string()),
                    _ => {}
                }
            }
        }

        let capabilities = self.build_capabilities(spec)?;

        // Working directory precedence: builder override, spec workdir, non-empty
        // image working dir, then "/".
        let cwd = self
            .cwd
            .clone()
            .or_else(|| spec.command.workdir.clone())
            .or_else(|| {
                self.image_config
                    .as_ref()
                    .and_then(|c| c.working_dir.as_ref())
                    .filter(|w| !w.is_empty())
                    .cloned()
            })
            .unwrap_or_else(|| "/".to_string());

        // Builder-supplied args win over the command resolved from spec/image.
        let process_args = if let Some(ref args) = self.args {
            args.clone()
        } else {
            Self::resolve_command_from_spec(spec, self.image_config.as_ref())
        };

        // no_new_privileges is only enabled for unprivileged containers that
        // request no explicit capabilities.
        let mut process_builder = ProcessBuilder::default()
            .terminal(false)
            .user(user)
            .env(env)
            .args(process_args)
            .cwd(cwd)
            .no_new_privileges(!spec.privileged && spec.capabilities.is_empty());

        if let Some(caps) = capabilities {
            process_builder = process_builder.capabilities(caps);
        }

        let process = process_builder
            .build()
            .map_err(|e| AgentError::InvalidSpec(format!("failed to build process: {e}")))?;

        // The root path is relative to the bundle directory and mounted read-write.
        let root = RootBuilder::default()
            .path("rootfs".to_string())
            .readonly(false)
            .build()
            .map_err(|e| AgentError::InvalidSpec(format!("failed to build root: {e}")))?;

        let mut mounts = self.build_default_mounts(spec)?;

        let storage_mounts = self.build_storage_mounts(spec, volume_paths)?;
        mounts.extend(storage_mounts);

        // Host socket, bind-mounted read-only at the default socket path.
        if let Some(ref socket_path) = self.socket_path {
            mounts.push(
                MountBuilder::default()
                    .destination(zlayer_paths::ZLayerDirs::default_socket_path())
                    .typ("bind")
                    .source(socket_path.clone())
                    .options(vec!["rbind".into(), "ro".into()])
                    .build()
                    // NOTE(review): this is the only mount in the function that panics
                    // on builder failure instead of returning InvalidSpec.
                    .expect("valid socket mount"),
            );
        }

        // Custom DNS: write a resolv.conf into the bundle and bind-mount it read-only.
        // Skipped when the bundle directory does not exist on disk.
        // NOTE(review): this reads `spec.host_network`, not the builder's own
        // `host_network` flag — confirm which one is meant to apply here.
        if !spec.host_network && !spec.dns.is_empty() && self.bundle_dir.exists() {
            let resolv_path = self.bundle_dir.join("resolv.conf");
            let contents = generate_resolv_conf(&spec.dns);
            fs::write(&resolv_path, contents).await.map_err(|e| {
                AgentError::InvalidSpec(format!(
                    "failed to write resolv.conf to bundle at {}: {e}",
                    resolv_path.display()
                ))
            })?;
            mounts.push(
                MountBuilder::default()
                    .destination("/etc/resolv.conf".to_string())
                    .typ("bind")
                    .source(resolv_path.to_string_lossy().to_string())
                    .options(vec!["rbind".to_string(), "ro".to_string()])
                    .build()
                    .map_err(|e| {
                        AgentError::InvalidSpec(format!("failed to build resolv.conf mount: {e}"))
                    })?,
            );
        }

        // CDI mounts: always bind mounts; "rbind" is added when the CDI options name
        // neither "bind" nor "rbind".
        if let Some(ref edits_per_device) = cdi_edits {
            for edits in edits_per_device {
                for cdi_mount in &edits.mounts {
                    let mut opts = cdi_mount.options.clone();
                    if !opts.iter().any(|o| o == "bind" || o == "rbind") {
                        opts.push("rbind".to_string());
                    }
                    mounts.push(
                        MountBuilder::default()
                            .destination(cdi_mount.container_path.clone())
                            .typ("bind")
                            .source(cdi_mount.host_path.clone())
                            .options(opts)
                            .build()
                            .map_err(|e| {
                                AgentError::InvalidSpec(format!("failed to build CDI mount: {e}"))
                            })?,
                    );
                }
            }
        }

        // MPS directories are mounted read-write at the same path inside the
        // container as on the host, matching the CUDA_MPS_* variables set above.
        if let Some(ref dirs) = mps_dirs {
            mounts.push(
                MountBuilder::default()
                    .destination(dirs.pipe_dir.clone())
                    .typ("bind")
                    .source(dirs.pipe_dir.clone())
                    .options(vec!["rbind".into(), "rw".into()])
                    .build()
                    .map_err(|e| {
                        AgentError::InvalidSpec(format!("failed to build MPS pipe mount: {e}"))
                    })?,
            );
            mounts.push(
                MountBuilder::default()
                    .destination(dirs.log_dir.clone())
                    .typ("bind")
                    .source(dirs.log_dir.clone())
                    .options(vec!["rbind".into(), "rw".into()])
                    .build()
                    .map_err(|e| {
                        AgentError::InvalidSpec(format!("failed to build MPS log mount: {e}"))
                    })?,
            );
        }
        // Time-slicing config: must be a regular file on the host; mounted read-only
        // at TIMESLICE_CONFIG_CONTAINER_PATH.
        if let Some(ref gpu) = spec.resources.gpu {
            if gpu.sharing == Some(GpuSharingMode::TimeSlice) {
                if let Some(ref cfg_path) = gpu.time_slicing_config_path {
                    let host = PathBuf::from(cfg_path);
                    if !host.is_file() {
                        return Err(AgentError::GpuSharingUnavailable {
                            mode: "time-slice".to_string(),
                            reason: format!(
                                "time-slicing config {} is not a regular file on the host",
                                host.display()
                            ),
                        });
                    }
                    mounts.push(
                        MountBuilder::default()
                            .destination(PathBuf::from(TIMESLICE_CONFIG_CONTAINER_PATH))
                            .typ("bind")
                            .source(host)
                            .options(vec!["rbind".into(), "ro".into()])
                            .build()
                            .map_err(|e| {
                                AgentError::InvalidSpec(format!(
                                    "failed to build time-slicing config mount: {e}"
                                ))
                            })?,
                    );
                }
            }
        }

        let linux = self.build_linux_config(container_id, spec, cdi_edits.as_deref())?;

        // Hostname: builder override, otherwise the container id string.
        let hostname = self
            .hostname
            .clone()
            .unwrap_or_else(|| container_id.to_string());

        let mut spec_builder = SpecBuilder::default()
            .version("1.0.2".to_string())
            .root(root)
            .process(process)
            .hostname(hostname)
            .mounts(mounts)
            .linux(linux);

        // Hooks are only attached when the CDI edits actually define some.
        if let Some(ref edits_per_device) = cdi_edits {
            if let Some(hooks) = Self::build_hooks_from_cdi(edits_per_device)? {
                spec_builder = spec_builder.hooks(hooks);
            }
        }

        let oci_spec = spec_builder
            .build()
            .map_err(|e| AgentError::InvalidSpec(format!("failed to build OCI spec: {e}")))?;

        Ok(oci_spec)
    }
1289
1290 fn build_hooks_from_cdi(edits_per_device: &[CdiContainerEdits]) -> Result<Option<Hooks>> {
1297 let mut prestart: Vec<Hook> = Vec::new();
1298 let mut create_runtime: Vec<Hook> = Vec::new();
1299 let mut create_container: Vec<Hook> = Vec::new();
1300 let mut start_container: Vec<Hook> = Vec::new();
1301 let mut poststart: Vec<Hook> = Vec::new();
1302 let mut poststop: Vec<Hook> = Vec::new();
1303
1304 for edits in edits_per_device {
1305 let Some(ref h) = edits.hooks else { continue };
1306 for hook in &h.prestart {
1307 prestart.push(convert_cdi_hook(hook)?);
1308 }
1309 for hook in &h.create_runtime {
1310 create_runtime.push(convert_cdi_hook(hook)?);
1311 }
1312 for hook in &h.create_container {
1313 create_container.push(convert_cdi_hook(hook)?);
1314 }
1315 for hook in &h.start_container {
1316 start_container.push(convert_cdi_hook(hook)?);
1317 }
1318 for hook in &h.poststart {
1319 poststart.push(convert_cdi_hook(hook)?);
1320 }
1321 for hook in &h.poststop {
1322 poststop.push(convert_cdi_hook(hook)?);
1323 }
1324 }
1325
1326 if prestart.is_empty()
1327 && create_runtime.is_empty()
1328 && create_container.is_empty()
1329 && start_container.is_empty()
1330 && poststart.is_empty()
1331 && poststop.is_empty()
1332 {
1333 return Ok(None);
1334 }
1335
1336 let mut builder = HooksBuilder::default();
1337 if !prestart.is_empty() {
1338 #[allow(deprecated)]
1339 {
1340 builder = builder.prestart(prestart);
1341 }
1342 }
1343 if !create_runtime.is_empty() {
1344 builder = builder.create_runtime(create_runtime);
1345 }
1346 if !create_container.is_empty() {
1347 builder = builder.create_container(create_container);
1348 }
1349 if !start_container.is_empty() {
1350 builder = builder.start_container(start_container);
1351 }
1352 if !poststart.is_empty() {
1353 builder = builder.poststart(poststart);
1354 }
1355 if !poststop.is_empty() {
1356 builder = builder.poststop(poststop);
1357 }
1358
1359 let hooks = builder
1360 .build()
1361 .map_err(|e| AgentError::InvalidSpec(format!("failed to build CDI hooks: {e}")))?;
1362 Ok(Some(hooks))
1363 }
1364
1365 #[allow(clippy::unused_self)]
1367 fn build_capabilities(
1368 &self,
1369 spec: &ServiceSpec,
1370 ) -> Result<Option<oci_spec::runtime::LinuxCapabilities>> {
1371 if spec.privileged {
1372 let all_caps: HashSet<Capability> = ALL_CAPABILITIES.iter().copied().collect();
1374 let empty_caps: HashSet<Capability> = HashSet::new();
1375
1376 let caps = LinuxCapabilitiesBuilder::default()
1377 .bounding(all_caps.clone())
1378 .effective(all_caps.clone())
1379 .permitted(all_caps)
1380 .inheritable(empty_caps.clone())
1381 .ambient(empty_caps)
1382 .build()
1383 .map_err(|e| {
1384 AgentError::InvalidSpec(format!("failed to build capabilities: {e}"))
1385 })?;
1386
1387 Ok(Some(caps))
1388 } else if !spec.capabilities.is_empty() {
1389 let caps: HashSet<Capability> = spec
1391 .capabilities
1392 .iter()
1393 .filter_map(|c| {
1394 let cap_name = if c.starts_with("CAP_") {
1396 c.to_uppercase()
1397 } else {
1398 format!("CAP_{}", c.to_uppercase())
1399 };
1400 Capability::from_str(&cap_name).ok()
1401 })
1402 .collect();
1403
1404 let empty_caps: HashSet<Capability> = HashSet::new();
1405
1406 let built_caps = LinuxCapabilitiesBuilder::default()
1407 .bounding(caps.clone())
1408 .effective(caps.clone())
1409 .permitted(caps)
1410 .inheritable(empty_caps.clone())
1411 .ambient(empty_caps)
1412 .build()
1413 .map_err(|e| {
1414 AgentError::InvalidSpec(format!("failed to build capabilities: {e}"))
1415 })?;
1416
1417 Ok(Some(built_caps))
1418 } else {
1419 let default_caps: HashSet<Capability> = [
1421 Capability::Chown,
1422 Capability::DacOverride,
1423 Capability::Fsetid,
1424 Capability::Fowner,
1425 Capability::Mknod,
1426 Capability::NetRaw,
1427 Capability::Setgid,
1428 Capability::Setuid,
1429 Capability::Setfcap,
1430 Capability::Setpcap,
1431 Capability::NetBindService,
1432 Capability::SysChroot,
1433 Capability::Kill,
1434 Capability::AuditWrite,
1435 ]
1436 .into_iter()
1437 .collect();
1438
1439 let empty_caps: HashSet<Capability> = HashSet::new();
1440
1441 let built_caps = LinuxCapabilitiesBuilder::default()
1442 .bounding(default_caps.clone())
1443 .effective(default_caps.clone())
1444 .permitted(default_caps)
1445 .inheritable(empty_caps.clone())
1446 .ambient(empty_caps)
1447 .build()
1448 .map_err(|e| {
1449 AgentError::InvalidSpec(format!("failed to build capabilities: {e}"))
1450 })?;
1451
1452 Ok(Some(built_caps))
1453 }
1454 }
1455
1456 #[allow(clippy::unused_self, clippy::too_many_lines)]
1458 fn build_default_mounts(&self, spec: &ServiceSpec) -> Result<Vec<Mount>> {
1459 let mut mounts = Vec::new();
1460
1461 mounts.push(
1463 MountBuilder::default()
1464 .destination("/proc".to_string())
1465 .typ("proc".to_string())
1466 .source("proc".to_string())
1467 .options(vec![
1468 "nosuid".to_string(),
1469 "noexec".to_string(),
1470 "nodev".to_string(),
1471 ])
1472 .build()
1473 .map_err(|e| {
1474 AgentError::InvalidSpec(format!("failed to build /proc mount: {e}"))
1475 })?,
1476 );
1477
1478 mounts.push(
1480 MountBuilder::default()
1481 .destination("/dev".to_string())
1482 .typ("tmpfs".to_string())
1483 .source("tmpfs".to_string())
1484 .options(vec![
1485 "nosuid".to_string(),
1486 "strictatime".to_string(),
1487 "mode=755".to_string(),
1488 "size=65536k".to_string(),
1489 ])
1490 .build()
1491 .map_err(|e| AgentError::InvalidSpec(format!("failed to build /dev mount: {e}")))?,
1492 );
1493
1494 mounts.push(
1496 MountBuilder::default()
1497 .destination("/dev/pts".to_string())
1498 .typ("devpts".to_string())
1499 .source("devpts".to_string())
1500 .options(vec![
1501 "nosuid".to_string(),
1502 "noexec".to_string(),
1503 "newinstance".to_string(),
1504 "ptmxmode=0666".to_string(),
1505 "mode=0620".to_string(),
1506 "gid=5".to_string(),
1507 ])
1508 .build()
1509 .map_err(|e| {
1510 AgentError::InvalidSpec(format!("failed to build /dev/pts mount: {e}"))
1511 })?,
1512 );
1513
1514 mounts.push(
1516 MountBuilder::default()
1517 .destination("/dev/shm".to_string())
1518 .typ("tmpfs".to_string())
1519 .source("shm".to_string())
1520 .options(vec![
1521 "nosuid".to_string(),
1522 "noexec".to_string(),
1523 "nodev".to_string(),
1524 "mode=1777".to_string(),
1525 "size=65536k".to_string(),
1526 ])
1527 .build()
1528 .map_err(|e| {
1529 AgentError::InvalidSpec(format!("failed to build /dev/shm mount: {e}"))
1530 })?,
1531 );
1532
1533 mounts.push(
1535 MountBuilder::default()
1536 .destination("/dev/mqueue".to_string())
1537 .typ("mqueue".to_string())
1538 .source("mqueue".to_string())
1539 .options(vec![
1540 "nosuid".to_string(),
1541 "noexec".to_string(),
1542 "nodev".to_string(),
1543 ])
1544 .build()
1545 .map_err(|e| {
1546 AgentError::InvalidSpec(format!("failed to build /dev/mqueue mount: {e}"))
1547 })?,
1548 );
1549
1550 let sys_options = if spec.privileged {
1552 vec![
1553 "nosuid".to_string(),
1554 "noexec".to_string(),
1555 "nodev".to_string(),
1556 ]
1557 } else {
1558 vec![
1559 "nosuid".to_string(),
1560 "noexec".to_string(),
1561 "nodev".to_string(),
1562 "ro".to_string(),
1563 ]
1564 };
1565
1566 mounts.push(
1567 MountBuilder::default()
1568 .destination("/sys".to_string())
1569 .typ("sysfs".to_string())
1570 .source("sysfs".to_string())
1571 .options(sys_options)
1572 .build()
1573 .map_err(|e| AgentError::InvalidSpec(format!("failed to build /sys mount: {e}")))?,
1574 );
1575
1576 mounts.push(
1578 MountBuilder::default()
1579 .destination("/sys/fs/cgroup".to_string())
1580 .typ("cgroup2".to_string())
1581 .source("cgroup".to_string())
1582 .options(vec![
1583 "nosuid".to_string(),
1584 "noexec".to_string(),
1585 "nodev".to_string(),
1586 "relatime".to_string(),
1587 ])
1588 .build()
1589 .map_err(|e| {
1590 AgentError::InvalidSpec(format!("failed to build cgroup mount: {e}"))
1591 })?,
1592 );
1593
1594 Ok(mounts)
1595 }
1596
1597 #[allow(clippy::unused_self, clippy::too_many_lines)]
1603 fn build_storage_mounts(
1604 &self,
1605 spec: &ServiceSpec,
1606 volume_paths: &std::collections::HashMap<String, PathBuf>,
1607 ) -> Result<Vec<Mount>> {
1608 let mut mounts = Vec::new();
1609
1610 for storage in &spec.storage {
1611 let mount = match storage {
1612 StorageSpec::Bind {
1613 source,
1614 target,
1615 readonly,
1616 } => {
1617 let mut options = vec!["rbind".to_string()];
1618 if *readonly {
1619 options.push("ro".to_string());
1620 } else {
1621 options.push("rw".to_string());
1622 }
1623
1624 MountBuilder::default()
1625 .destination(target.clone())
1626 .typ("none".to_string())
1627 .source(source.clone())
1628 .options(options)
1629 .build()
1630 .map_err(|e| {
1631 AgentError::InvalidSpec(format!(
1632 "failed to build bind mount for {target}: {e}"
1633 ))
1634 })?
1635 }
1636
1637 StorageSpec::Named {
1638 name,
1639 target,
1640 readonly,
1641 tier,
1642 ..
1643 } => {
1644 let source = volume_paths.get(name).ok_or_else(|| {
1646 AgentError::InvalidSpec(format!(
1647 "volume '{name}' not prepared - ensure StorageManager.ensure_volume() was called"
1648 ))
1649 })?;
1650
1651 if matches!(tier, StorageTier::Network) {
1653 tracing::warn!(
1654 volume = %name,
1655 tier = ?tier,
1656 "Network storage tier is NOT SQLite-safe. Avoid using SQLite databases on this volume."
1657 );
1658 }
1659
1660 let mut options = vec!["rbind".to_string()];
1661 if *readonly {
1662 options.push("ro".to_string());
1663 } else {
1664 options.push("rw".to_string());
1665 }
1666
1667 MountBuilder::default()
1668 .destination(target.clone())
1669 .typ("none".to_string())
1670 .source(source.to_string_lossy().to_string())
1671 .options(options)
1672 .build()
1673 .map_err(|e| {
1674 AgentError::InvalidSpec(format!(
1675 "failed to build named volume mount for {target}: {e}"
1676 ))
1677 })?
1678 }
1679
1680 StorageSpec::Anonymous { target, tier } => {
1681 let key = format!("_anon_{}", target.trim_start_matches('/').replace('/', "_"));
1684 let source = volume_paths.get(&key).ok_or_else(|| {
1685 AgentError::InvalidSpec(format!(
1686 "anonymous volume for '{target}' not prepared"
1687 ))
1688 })?;
1689
1690 if matches!(tier, StorageTier::Network) {
1691 tracing::warn!(
1692 target = %target,
1693 tier = ?tier,
1694 "Network storage tier is NOT SQLite-safe."
1695 );
1696 }
1697
1698 let options = vec!["rbind".to_string(), "rw".to_string()];
1699
1700 MountBuilder::default()
1701 .destination(target.clone())
1702 .typ("none".to_string())
1703 .source(source.to_string_lossy().to_string())
1704 .options(options)
1705 .build()
1706 .map_err(|e| {
1707 AgentError::InvalidSpec(format!(
1708 "failed to build anonymous volume mount for {target}: {e}"
1709 ))
1710 })?
1711 }
1712
1713 StorageSpec::Tmpfs { target, size, mode } => {
1714 let mut options = vec!["nosuid".to_string(), "nodev".to_string()];
1715
1716 if let Some(size_str) = size {
1717 options.push(format!("size={size_str}"));
1718 }
1719
1720 if let Some(mode_val) = mode {
1721 options.push(format!("mode={mode_val:o}"));
1722 }
1723
1724 MountBuilder::default()
1725 .destination(target.clone())
1726 .typ("tmpfs".to_string())
1727 .source("tmpfs".to_string())
1728 .options(options)
1729 .build()
1730 .map_err(|e| {
1731 AgentError::InvalidSpec(format!(
1732 "failed to build tmpfs mount for {target}: {e}"
1733 ))
1734 })?
1735 }
1736
1737 StorageSpec::S3 {
1738 bucket,
1739 prefix,
1740 target,
1741 readonly,
1742 endpoint: _,
1743 credentials: _,
1744 } => {
1745 let key = format!("_s3_{}_{}", bucket, prefix.as_deref().unwrap_or(""));
1748 let source = volume_paths.get(&key).ok_or_else(|| {
1749 AgentError::InvalidSpec(format!(
1750 "S3 volume for bucket '{bucket}' not mounted - ensure StorageManager.mount_s3() was called"
1751 ))
1752 })?;
1753
1754 tracing::warn!(
1755 bucket = %bucket,
1756 target = %target,
1757 "S3 storage is NOT SQLite-safe. Use for read-heavy workloads only."
1758 );
1759
1760 let mut options = vec!["rbind".to_string()];
1761 if *readonly {
1762 options.push("ro".to_string());
1763 } else {
1764 options.push("rw".to_string());
1765 }
1766
1767 MountBuilder::default()
1768 .destination(target.clone())
1769 .typ("none".to_string())
1770 .source(source.to_string_lossy().to_string())
1771 .options(options)
1772 .build()
1773 .map_err(|e| {
1774 AgentError::InvalidSpec(format!(
1775 "failed to build S3 mount for {target}: {e}"
1776 ))
1777 })?
1778 }
1779 };
1780
1781 mounts.push(mount);
1782 }
1783
1784 Ok(mounts)
1785 }
1786
1787 #[allow(clippy::similar_names)] #[allow(clippy::too_many_lines)]
1790 fn build_linux_config(
1791 &self,
1792 container_id: &ContainerId,
1793 spec: &ServiceSpec,
1794 cdi_edits: Option<&[CdiContainerEdits]>,
1795 ) -> Result<oci_spec::runtime::Linux> {
1796 let mut namespaces = vec![
1798 LinuxNamespaceBuilder::default()
1799 .typ(LinuxNamespaceType::Pid)
1800 .build()
1801 .unwrap(),
1802 LinuxNamespaceBuilder::default()
1803 .typ(LinuxNamespaceType::Ipc)
1804 .build()
1805 .unwrap(),
1806 LinuxNamespaceBuilder::default()
1807 .typ(LinuxNamespaceType::Uts)
1808 .build()
1809 .unwrap(),
1810 LinuxNamespaceBuilder::default()
1811 .typ(LinuxNamespaceType::Mount)
1812 .build()
1813 .unwrap(),
1814 ];
1815
1816 if !self.host_network {
1820 namespaces.push(
1821 LinuxNamespaceBuilder::default()
1822 .typ(LinuxNamespaceType::Network)
1823 .build()
1824 .unwrap(),
1825 );
1826 }
1827
1828 #[cfg(unix)]
1833 let rootless = !nix::unistd::geteuid().is_root();
1834 #[cfg(not(unix))]
1835 let rootless = false;
1836
1837 if rootless {
1838 namespaces.push(
1839 LinuxNamespaceBuilder::default()
1840 .typ(LinuxNamespaceType::User)
1841 .build()
1842 .unwrap(),
1843 );
1844 namespaces.push(
1845 LinuxNamespaceBuilder::default()
1846 .typ(LinuxNamespaceType::Cgroup)
1847 .build()
1848 .unwrap(),
1849 );
1850 }
1851
1852 let mut linux_builder = LinuxBuilder::default().namespaces(namespaces);
1853
1854 #[cfg(unix)]
1855 if rootless {
1856 let euid = nix::unistd::geteuid();
1857 let egid = nix::unistd::getegid();
1858 let username = nix::unistd::User::from_uid(euid)
1859 .ok()
1860 .flatten()
1861 .map(|u| u.name)
1862 .unwrap_or_default();
1863 linux_builder = linux_builder
1864 .uid_mappings(build_rootless_id_mappings(
1865 euid.as_raw(),
1866 "/etc/subuid",
1867 &username,
1868 ))
1869 .gid_mappings(build_rootless_id_mappings(
1870 egid.as_raw(),
1871 "/etc/subgid",
1872 &username,
1873 ));
1874 }
1875
1876 let resources = self.build_resources(spec)?;
1878 if let Some(resources) = resources {
1879 linux_builder = linux_builder.resources(resources);
1880 }
1881
1882 let mut devices = self.build_devices(spec, None, cdi_edits.is_some())?;
1889 if let Some(edits_per_device) = cdi_edits {
1890 for edits in edits_per_device {
1891 for node in &edits.device_nodes {
1892 devices.push(cdi_node_to_oci_device(node)?);
1893 }
1894 }
1895 }
1896 if !devices.is_empty() {
1897 linux_builder = linux_builder.devices(devices);
1898 }
1899
1900 linux_builder = linux_builder.rootfs_propagation("private".to_string());
1902
1903 if spec.privileged {
1905 linux_builder = linux_builder.masked_paths(vec![]).readonly_paths(vec![]);
1907 } else {
1908 let masked_paths = vec![
1910 "/proc/acpi".to_string(),
1911 "/proc/asound".to_string(),
1912 "/proc/kcore".to_string(),
1913 "/proc/keys".to_string(),
1914 "/proc/latency_stats".to_string(),
1915 "/proc/timer_list".to_string(),
1916 "/proc/timer_stats".to_string(),
1917 "/proc/sched_debug".to_string(),
1918 "/proc/scsi".to_string(),
1919 "/sys/firmware".to_string(),
1920 ];
1921
1922 let readonly_paths = vec![
1924 "/proc/bus".to_string(),
1925 "/proc/fs".to_string(),
1926 "/proc/irq".to_string(),
1927 "/proc/sys".to_string(),
1928 "/proc/sysrq-trigger".to_string(),
1929 ];
1930
1931 linux_builder = linux_builder
1932 .masked_paths(masked_paths)
1933 .readonly_paths(readonly_paths);
1934 }
1935
1936 let cid = container_id.to_string();
1948
1949 let explicit_parent: Option<(String, &'static str)> =
1953 if let Some(p) = spec.cgroup_parent.as_deref().filter(|s| !s.is_empty()) {
1954 Some((p.to_string(), "spec"))
1955 } else if let Some(p) = std::env::var("ZLAYER_CGROUP_PARENT")
1956 .ok()
1957 .filter(|s| !s.is_empty())
1958 {
1959 Some((p, "env"))
1960 } else {
1961 None
1962 };
1963
1964 #[cfg(target_os = "linux")]
1970 let auto_parent: Option<(String, &'static str)> =
1971 if let Some(p) = crate::capability::ensure_daemon_leaf_and_container_parent() {
1972 Some((p, "auto-init"))
1973 } else if let Some(p) = crate::capability::current_cgroup_v2_path() {
1974 Some((p, "auto"))
1977 } else {
1978 None
1979 };
1980 #[cfg(not(target_os = "linux"))]
1981 let auto_parent: Option<(String, &'static str)> = None;
1982
1983 let (cgroup_parent_value, cgroup_parent_source): (Option<String>, &'static str) =
1984 explicit_parent
1985 .or(auto_parent)
1986 .map_or((None, "none"), |(p, s)| (Some(p), s));
1987
1988 #[cfg(target_os = "linux")]
1995 if cgroup_parent_value.is_none() && crate::capability::DaemonCapabilities::get().is_nested {
1996 tracing::warn!(
1997 container_id = %cid,
1998 "capability survey reports nested daemon but cgroup_parent could not be resolved — proceeding with v2 root"
1999 );
2000 }
2001
2002 if let Some(parent) = cgroup_parent_value {
2003 let parent = parent.trim_end_matches('/');
2004 let full = format!("{parent}/{cid}");
2005 match cgroup_parent_source {
2006 "spec" => tracing::info!(
2007 container_id = %cid,
2008 source = "spec",
2009 path = %full,
2010 "cgroup_parent selected"
2011 ),
2012 "env" => tracing::info!(
2013 container_id = %cid,
2014 source = "env",
2015 path = %full,
2016 "cgroup_parent selected"
2017 ),
2018 "auto" => tracing::info!(
2019 container_id = %cid,
2020 source = "auto",
2021 path = %full,
2022 "cgroup_parent selected (from /proc/self/cgroup)"
2023 ),
2024 "auto-init" => tracing::info!(
2025 container_id = %cid,
2026 source = "auto-init",
2027 path = %full,
2028 "cgroup_parent selected (migrated daemon to <scope>/init; containers go under <scope>/containers)"
2029 ),
2030 _ => unreachable!(),
2031 }
2032 linux_builder = linux_builder.cgroups_path(std::path::PathBuf::from(full));
2033 } else {
2034 #[cfg(target_os = "linux")]
2043 {
2044 let caps = crate::capability::DaemonCapabilities::get();
2045 if !caps.can_write_cgroup_root {
2046 return Err(AgentError::InvalidSpec(format!(
2047 "cannot create container {cid}: no writable cgroup parent. \
2048 /proc/self/cgroup reports the cgroup-v2 root, and \
2049 /sys/fs/cgroup is read-only to this process. Fix one of: \
2050 (a) run the daemon's outer container with --cgroupns=host \
2051 so /proc/self/cgroup reports a real parent; \
2052 (b) set ZLAYER_CGROUP_PARENT=/path/to/writable/cgroup; \
2053 (c) grant the daemon write access to /sys/fs/cgroup."
2054 )));
2055 }
2056 tracing::info!(
2057 container_id = %cid,
2058 "cgroup_parent unset — libcontainer will use v2 root (cgroup root is writable here)"
2059 );
2060 }
2061 #[cfg(not(target_os = "linux"))]
2062 tracing::debug!(
2063 container_id = %cid,
2064 "non-Linux host — cgroup_parent unset; libcontainer inside the WSL distro will resolve a parent from its cgroup-v2 root"
2065 );
2066 }
2067
2068 linux_builder
2069 .build()
2070 .map_err(|e| AgentError::InvalidSpec(format!("failed to build linux config: {e}")))
2071 }
2072
2073 #[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
2075 fn build_resources(
2076 &self,
2077 spec: &ServiceSpec,
2078 ) -> Result<Option<oci_spec::runtime::LinuxResources>> {
2079 let mut resources_builder = LinuxResourcesBuilder::default();
2080 let mut has_resources = false;
2081
2082 if let Some(cpu_limit) = spec.resources.cpu {
2084 let quota = (cpu_limit * 100_000.0) as i64;
2087 let cpu = LinuxCpuBuilder::default()
2088 .quota(quota)
2089 .period(100_000u64)
2090 .build()
2091 .map_err(|e| AgentError::InvalidSpec(format!("failed to build CPU limits: {e}")))?;
2092
2093 resources_builder = resources_builder.cpu(cpu);
2094 has_resources = true;
2095 }
2096
2097 if let Some(ref memory_str) = spec.resources.memory {
2099 let bytes = parse_memory_string(memory_str)
2100 .map_err(|e| AgentError::InvalidSpec(format!("invalid memory limit: {e}")))?;
2101
2102 let memory = LinuxMemoryBuilder::default()
2103 .limit(bytes as i64)
2104 .build()
2105 .map_err(|e| {
2106 AgentError::InvalidSpec(format!("failed to build memory limits: {e}"))
2107 })?;
2108
2109 resources_builder = resources_builder.memory(memory);
2110 has_resources = true;
2111 }
2112
2113 let device_rules = self.build_device_cgroup_rules(spec, None)?;
2115 if !device_rules.is_empty() {
2116 resources_builder = resources_builder.devices(device_rules);
2117 has_resources = true;
2118 }
2119
2120 if has_resources {
2121 let resources = resources_builder
2122 .build()
2123 .map_err(|e| AgentError::InvalidSpec(format!("failed to build resources: {e}")))?;
2124 Ok(Some(resources))
2125 } else {
2126 Ok(None)
2127 }
2128 }
2129
2130 #[allow(clippy::unused_self, clippy::too_many_lines)]
2132 fn build_device_cgroup_rules(
2133 &self,
2134 spec: &ServiceSpec,
2135 _gpu_indices: Option<&[u32]>,
2136 ) -> Result<Vec<oci_spec::runtime::LinuxDeviceCgroup>> {
2137 let mut rules = Vec::new();
2138
2139 if spec.privileged {
2140 let rule = LinuxDeviceCgroupBuilder::default()
2142 .allow(true)
2143 .access("rwm".to_string())
2144 .build()
2145 .map_err(|e| {
2146 AgentError::InvalidSpec(format!("failed to build device cgroup rule: {e}"))
2147 })?;
2148 rules.push(rule);
2149 } else {
2150 let deny_all = LinuxDeviceCgroupBuilder::default()
2152 .allow(false)
2153 .access("rwm".to_string())
2154 .build()
2155 .map_err(|e| AgentError::InvalidSpec(format!("failed to build deny rule: {e}")))?;
2156 rules.push(deny_all);
2157
2158 let standard_char_devices = [
2161 (1, 3, "rwm"), (1, 5, "rwm"), (1, 7, "rwm"), (1, 8, "rwm"), (1, 9, "rwm"), (5, 0, "rwm"), (5, 1, "rwm"), (5, 2, "rwm"), (136, -1, "rwm"), ];
2171
2172 for (major, minor, access) in standard_char_devices {
2173 let mut builder = LinuxDeviceCgroupBuilder::default()
2174 .allow(true)
2175 .typ(LinuxDeviceType::C)
2176 .major(i64::from(major))
2177 .access(access.to_string());
2178
2179 if minor >= 0 {
2180 builder = builder.minor(i64::from(minor));
2181 }
2182
2183 let rule = builder.build().map_err(|e| {
2184 AgentError::InvalidSpec(format!("failed to build char device rule: {e}"))
2185 })?;
2186 rules.push(rule);
2187 }
2188
2189 #[cfg(unix)]
2193 for device in &spec.devices {
2194 if let Ok((major, minor)) = get_device_major_minor(&device.path) {
2195 let dev_type = get_device_type(&device.path).unwrap_or(LinuxDeviceType::C);
2196
2197 let mut access = String::new();
2199 if device.read {
2200 access.push('r');
2201 }
2202 if device.write {
2203 access.push('w');
2204 }
2205 if device.mknod {
2206 access.push('m');
2207 }
2208 if access.is_empty() {
2209 access = "rw".to_string();
2210 }
2211
2212 let rule = LinuxDeviceCgroupBuilder::default()
2213 .allow(true)
2214 .typ(dev_type)
2215 .major(major)
2216 .minor(minor)
2217 .access(access)
2218 .build()
2219 .map_err(|e| {
2220 AgentError::InvalidSpec(format!(
2221 "failed to build device rule for {}: {}",
2222 device.path, e
2223 ))
2224 })?;
2225 rules.push(rule);
2226 } else {
2227 tracing::warn!("Failed to get device info for {}, skipping", device.path);
2228 }
2229 }
2230
2231 if let Some(ref gpu) = spec.resources.gpu {
2233 match gpu.vendor.as_str() {
2234 "nvidia" => {
2235 let rule = LinuxDeviceCgroupBuilder::default()
2237 .allow(true)
2238 .typ(LinuxDeviceType::C)
2239 .major(195i64)
2240 .access("rwm".to_string())
2241 .build()
2242 .map_err(|e| {
2243 AgentError::InvalidSpec(format!(
2244 "failed to build GPU cgroup rule: {e}"
2245 ))
2246 })?;
2247 rules.push(rule);
2248
2249 let uvm_rule = LinuxDeviceCgroupBuilder::default()
2251 .allow(true)
2252 .typ(LinuxDeviceType::C)
2253 .major(510i64)
2254 .access("rwm".to_string())
2255 .build()
2256 .map_err(|e| {
2257 AgentError::InvalidSpec(format!(
2258 "failed to build GPU UVM cgroup rule: {e}"
2259 ))
2260 })?;
2261 rules.push(uvm_rule);
2262 }
2263 "amd" => {
2264 let dri_rule = LinuxDeviceCgroupBuilder::default()
2266 .allow(true)
2267 .typ(LinuxDeviceType::C)
2268 .major(226i64)
2269 .access("rwm".to_string())
2270 .build()
2271 .map_err(|e| {
2272 AgentError::InvalidSpec(format!(
2273 "failed to build AMD DRI cgroup rule: {e}"
2274 ))
2275 })?;
2276 rules.push(dri_rule);
2277
2278 let kfd_rule = LinuxDeviceCgroupBuilder::default()
2280 .allow(true)
2281 .typ(LinuxDeviceType::C)
2282 .major(234i64)
2283 .access("rwm".to_string())
2284 .build()
2285 .map_err(|e| {
2286 AgentError::InvalidSpec(format!(
2287 "failed to build AMD KFD cgroup rule: {e}"
2288 ))
2289 })?;
2290 rules.push(kfd_rule);
2291 }
2292 "intel" => {
2293 let dri_rule = LinuxDeviceCgroupBuilder::default()
2295 .allow(true)
2296 .typ(LinuxDeviceType::C)
2297 .major(226i64)
2298 .access("rwm".to_string())
2299 .build()
2300 .map_err(|e| {
2301 AgentError::InvalidSpec(format!(
2302 "failed to build Intel DRI cgroup rule: {e}"
2303 ))
2304 })?;
2305 rules.push(dri_rule);
2306 }
2307 other => {
2308 tracing::warn!(
2310 vendor = %other,
2311 "Unknown GPU vendor, allowing DRI devices (major 226)"
2312 );
2313 let dri_rule = LinuxDeviceCgroupBuilder::default()
2314 .allow(true)
2315 .typ(LinuxDeviceType::C)
2316 .major(226i64)
2317 .access("rwm".to_string())
2318 .build()
2319 .map_err(|e| {
2320 AgentError::InvalidSpec(format!(
2321 "failed to build GPU DRI cgroup rule: {e}"
2322 ))
2323 })?;
2324 rules.push(dri_rule);
2325 }
2326 }
2327 }
2328 }
2329
2330 Ok(rules)
2331 }
2332
2333 #[allow(clippy::unused_self, clippy::too_many_lines)]
2342 #[cfg_attr(not(unix), allow(clippy::unnecessary_wraps, clippy::needless_return))]
2343 fn build_devices(
2344 &self,
2345 spec: &ServiceSpec,
2346 gpu_indices: Option<&[u32]>,
2347 skip_gpu_defaults: bool,
2348 ) -> Result<Vec<oci_spec::runtime::LinuxDevice>> {
2349 #[cfg(not(unix))]
2350 {
2351 let _ = (spec, gpu_indices, skip_gpu_defaults);
2352 return Ok(Vec::new());
2353 }
2354
2355 #[cfg(unix)]
2356 {
2357 let mut devices = Vec::new();
2358
2359 for device in &spec.devices {
2360 if let Ok((major, minor)) = get_device_major_minor(&device.path) {
2361 let dev_type = get_device_type(&device.path).unwrap_or(LinuxDeviceType::C);
2362
2363 let linux_device = LinuxDeviceBuilder::default()
2364 .path(device.path.clone())
2365 .typ(dev_type)
2366 .major(major)
2367 .minor(minor)
2368 .file_mode(0o666u32)
2369 .uid(0u32)
2370 .gid(0u32)
2371 .build()
2372 .map_err(|e| {
2373 AgentError::InvalidSpec(format!(
2374 "failed to build device {}: {}",
2375 device.path, e
2376 ))
2377 })?;
2378
2379 devices.push(linux_device);
2380 }
2381 }
2382
2383 if skip_gpu_defaults {
2388 return Ok(devices);
2389 }
2390
2391 if let Some(ref gpu) = spec.resources.gpu {
2393 let indices: Vec<u32> =
2394 gpu_indices.map_or_else(|| (0..gpu.count).collect(), <[u32]>::to_vec);
2395
2396 match gpu.vendor.as_str() {
2397 "nvidia" => {
2398 let always_devices =
2400 ["/dev/nvidiactl", "/dev/nvidia-uvm", "/dev/nvidia-uvm-tools"];
2401 for dev_path in &always_devices {
2402 if let Ok((major, minor)) = get_device_major_minor(dev_path) {
2403 let dev_type =
2404 get_device_type(dev_path).unwrap_or(LinuxDeviceType::C);
2405 let linux_device = LinuxDeviceBuilder::default()
2406 .path((*dev_path).to_string())
2407 .typ(dev_type)
2408 .major(major)
2409 .minor(minor)
2410 .file_mode(0o666u32)
2411 .uid(0u32)
2412 .gid(0u32)
2413 .build()
2414 .map_err(|e| {
2415 AgentError::InvalidSpec(format!(
2416 "failed to build GPU device {dev_path}: {e}"
2417 ))
2418 })?;
2419 devices.push(linux_device);
2420 } else {
2421 tracing::warn!(
2422 "GPU device {} not found on host, skipping",
2423 dev_path
2424 );
2425 }
2426 }
2427
2428 for i in &indices {
2430 let dev_path = format!("/dev/nvidia{i}");
2431 if let Ok((major, minor)) = get_device_major_minor(&dev_path) {
2432 let dev_type =
2433 get_device_type(&dev_path).unwrap_or(LinuxDeviceType::C);
2434 let linux_device = LinuxDeviceBuilder::default()
2435 .path(dev_path.clone())
2436 .typ(dev_type)
2437 .major(major)
2438 .minor(minor)
2439 .file_mode(0o666u32)
2440 .uid(0u32)
2441 .gid(0u32)
2442 .build()
2443 .map_err(|e| {
2444 AgentError::InvalidSpec(format!(
2445 "failed to build GPU device {dev_path}: {e}"
2446 ))
2447 })?;
2448 devices.push(linux_device);
2449 } else {
2450 tracing::warn!(
2451 "GPU device {} not found on host, skipping",
2452 dev_path
2453 );
2454 }
2455 }
2456 }
2457 "amd" => {
2458 let amd_always_devices = ["/dev/kfd"];
2460 for dev_path in &amd_always_devices {
2461 if let Ok((major, minor)) = get_device_major_minor(dev_path) {
2462 let dev_type =
2463 get_device_type(dev_path).unwrap_or(LinuxDeviceType::C);
2464 let linux_device = LinuxDeviceBuilder::default()
2465 .path((*dev_path).to_string())
2466 .typ(dev_type)
2467 .major(major)
2468 .minor(minor)
2469 .file_mode(0o666u32)
2470 .uid(0u32)
2471 .gid(0u32)
2472 .build()
2473 .map_err(|e| {
2474 AgentError::InvalidSpec(format!(
2475 "failed to build GPU device {dev_path}: {e}"
2476 ))
2477 })?;
2478 devices.push(linux_device);
2479 } else {
2480 tracing::warn!(
2481 "GPU device {} not found on host, skipping",
2482 dev_path
2483 );
2484 }
2485 }
2486
2487 for i in &indices {
2489 let dev_path = format!("/dev/dri/renderD{}", 128 + i);
2490 if let Ok((major, minor)) = get_device_major_minor(&dev_path) {
2491 let dev_type =
2492 get_device_type(&dev_path).unwrap_or(LinuxDeviceType::C);
2493 let linux_device = LinuxDeviceBuilder::default()
2494 .path(dev_path.clone())
2495 .typ(dev_type)
2496 .major(major)
2497 .minor(minor)
2498 .file_mode(0o666u32)
2499 .uid(0u32)
2500 .gid(0u32)
2501 .build()
2502 .map_err(|e| {
2503 AgentError::InvalidSpec(format!(
2504 "failed to build GPU device {dev_path}: {e}"
2505 ))
2506 })?;
2507 devices.push(linux_device);
2508 } else {
2509 tracing::warn!(
2510 "GPU device {} not found on host, skipping",
2511 dev_path
2512 );
2513 }
2514 }
2515
2516 for i in &indices {
2518 let dev_path = format!("/dev/dri/card{i}");
2519 if let Ok((major, minor)) = get_device_major_minor(&dev_path) {
2520 let dev_type =
2521 get_device_type(&dev_path).unwrap_or(LinuxDeviceType::C);
2522 let linux_device = LinuxDeviceBuilder::default()
2523 .path(dev_path.clone())
2524 .typ(dev_type)
2525 .major(major)
2526 .minor(minor)
2527 .file_mode(0o666u32)
2528 .uid(0u32)
2529 .gid(0u32)
2530 .build()
2531 .map_err(|e| {
2532 AgentError::InvalidSpec(format!(
2533 "failed to build GPU device {dev_path}: {e}"
2534 ))
2535 })?;
2536 devices.push(linux_device);
2537 } else {
2538 tracing::warn!(
2539 "GPU device {} not found on host, skipping",
2540 dev_path
2541 );
2542 }
2543 }
2544 }
2545 "intel" => {
2546 for i in &indices {
2548 let dev_path = format!("/dev/dri/renderD{}", 128 + i);
2549 if let Ok((major, minor)) = get_device_major_minor(&dev_path) {
2550 let dev_type =
2551 get_device_type(&dev_path).unwrap_or(LinuxDeviceType::C);
2552 let linux_device = LinuxDeviceBuilder::default()
2553 .path(dev_path.clone())
2554 .typ(dev_type)
2555 .major(major)
2556 .minor(minor)
2557 .file_mode(0o666u32)
2558 .uid(0u32)
2559 .gid(0u32)
2560 .build()
2561 .map_err(|e| {
2562 AgentError::InvalidSpec(format!(
2563 "failed to build GPU device {dev_path}: {e}"
2564 ))
2565 })?;
2566 devices.push(linux_device);
2567 } else {
2568 tracing::warn!(
2569 "GPU device {} not found on host, skipping",
2570 dev_path
2571 );
2572 }
2573 }
2574
2575 for i in &indices {
2577 let dev_path = format!("/dev/dri/card{i}");
2578 if let Ok((major, minor)) = get_device_major_minor(&dev_path) {
2579 let dev_type =
2580 get_device_type(&dev_path).unwrap_or(LinuxDeviceType::C);
2581 let linux_device = LinuxDeviceBuilder::default()
2582 .path(dev_path.clone())
2583 .typ(dev_type)
2584 .major(major)
2585 .minor(minor)
2586 .file_mode(0o666u32)
2587 .uid(0u32)
2588 .gid(0u32)
2589 .build()
2590 .map_err(|e| {
2591 AgentError::InvalidSpec(format!(
2592 "failed to build GPU device {dev_path}: {e}"
2593 ))
2594 })?;
2595 devices.push(linux_device);
2596 } else {
2597 tracing::warn!(
2598 "GPU device {} not found on host, skipping",
2599 dev_path
2600 );
2601 }
2602 }
2603 }
2604 other => {
2605 tracing::warn!(
2607 vendor = %other,
2608 "Unknown GPU vendor, attempting DRI device passthrough"
2609 );
2610 for i in &indices {
2611 let dev_path = format!("/dev/dri/renderD{}", 128 + i);
2612 if let Ok((major, minor)) = get_device_major_minor(&dev_path) {
2613 let dev_type =
2614 get_device_type(&dev_path).unwrap_or(LinuxDeviceType::C);
2615 let linux_device = LinuxDeviceBuilder::default()
2616 .path(dev_path.clone())
2617 .typ(dev_type)
2618 .major(major)
2619 .minor(minor)
2620 .file_mode(0o666u32)
2621 .uid(0u32)
2622 .gid(0u32)
2623 .build()
2624 .map_err(|e| {
2625 AgentError::InvalidSpec(format!(
2626 "failed to build GPU device {dev_path}: {e}"
2627 ))
2628 })?;
2629 devices.push(linux_device);
2630 } else {
2631 tracing::warn!(
2632 "GPU device {} not found on host, skipping",
2633 dev_path
2634 );
2635 }
2636 }
2637 }
2638 }
2639 }
2640
2641 Ok(devices)
2642 } }
2644
2645 pub async fn write_config(
2657 &self,
2658 container_id: &ContainerId,
2659 spec: &ServiceSpec,
2660 ) -> Result<PathBuf> {
2661 let oci_spec = self
2663 .build_spec_only(container_id, spec, &self.volume_paths)
2664 .await?;
2665
2666 let config_path = self.bundle_dir.join("config.json");
2668 let config_json =
2669 serde_json::to_string_pretty(&oci_spec).map_err(|e| AgentError::CreateFailed {
2670 id: container_id.to_string(),
2671 reason: format!("failed to serialize OCI spec: {e}"),
2672 })?;
2673
2674 fs::write(&config_path, config_json)
2675 .await
2676 .map_err(|e| AgentError::CreateFailed {
2677 id: container_id.to_string(),
2678 reason: format!("failed to write config.json: {e}"),
2679 })?;
2680
2681 tracing::debug!(
2682 "Wrote OCI config.json at {} for container {}",
2683 config_path.display(),
2684 container_id
2685 );
2686
2687 Ok(self.bundle_dir.clone())
2688 }
2689
2690 fn resolve_command_from_spec(
2699 spec: &ServiceSpec,
2700 image_config: Option<&zlayer_registry::ImageConfig>,
2701 ) -> Vec<String> {
2702 let mut args = Vec::new();
2703
2704 match (&spec.command.entrypoint, &spec.command.args) {
2705 (Some(entrypoint), Some(cmd_args)) => {
2706 args.extend_from_slice(entrypoint);
2707 args.extend_from_slice(cmd_args);
2708 }
2709 (Some(entrypoint), None) => {
2710 args.extend_from_slice(entrypoint);
2711 }
2712 (None, Some(cmd_args)) if !cmd_args.is_empty() => {
2713 args.extend_from_slice(cmd_args);
2714 }
2715 _ => {
2716 if let Some(img_cmd) =
2718 image_config.and_then(zlayer_registry::ImageConfig::full_command)
2719 {
2720 if img_cmd.is_empty() {
2721 args.push("/bin/sh".to_string());
2722 } else {
2723 args.extend(img_cmd);
2724 }
2725 } else {
2726 args.push("/bin/sh".to_string());
2727 }
2728 }
2729 }
2730
2731 args
2732 }
2733
2734 pub async fn cleanup(&self) -> Result<()> {
2741 if self.bundle_dir.exists() {
2742 fs::remove_dir_all(&self.bundle_dir)
2743 .await
2744 .map_err(|e| AgentError::CreateFailed {
2745 id: "cleanup".to_string(),
2746 reason: format!(
2747 "failed to remove bundle directory {}: {}",
2748 self.bundle_dir.display(),
2749 e
2750 ),
2751 })?;
2752 }
2753 Ok(())
2754 }
2755}
2756
2757#[cfg(unix)]
2770pub async fn create_bundle(
2771 container_id: &ContainerId,
2772 spec: &ServiceSpec,
2773 rootfs_path: Option<PathBuf>,
2774) -> Result<PathBuf> {
2775 let mut builder =
2776 BundleBuilder::for_container(container_id).with_host_network(spec.host_network);
2777
2778 if let Some(rootfs) = rootfs_path {
2779 builder = builder.with_rootfs(rootfs);
2780 }
2781
2782 builder.build(container_id, spec).await
2783}
2784
2785pub async fn cleanup_bundle(container_id: &ContainerId) -> Result<()> {
2792 let builder = BundleBuilder::for_container(container_id);
2793 builder.cleanup().await
2794}
2795
2796#[cfg(test)]
2797mod tests {
2798 use super::*;
2799 use zlayer_spec::*;
2800
2801 fn mock_spec() -> ServiceSpec {
2802 serde_yaml::from_str::<DeploymentSpec>(
2803 r"
2804version: v1
2805deployment: test
2806services:
2807 test:
2808 rtype: service
2809 image:
2810 name: test:latest
2811 endpoints:
2812 - name: http
2813 protocol: http
2814 port: 8080
2815",
2816 )
2817 .unwrap()
2818 .services
2819 .remove("test")
2820 .unwrap()
2821 }
2822
2823 #[cfg(target_os = "linux")]
2824 fn mock_spec_with_resources() -> ServiceSpec {
2825 serde_yaml::from_str::<DeploymentSpec>(
2826 r"
2827version: v1
2828deployment: test
2829services:
2830 test:
2831 rtype: service
2832 image:
2833 name: test:latest
2834 resources:
2835 cpu: 0.5
2836 memory: 512Mi
2837 env:
2838 MY_VAR: my_value
2839 ANOTHER: value2
2840 endpoints:
2841 - name: http
2842 protocol: http
2843 port: 8080
2844",
2845 )
2846 .unwrap()
2847 .services
2848 .remove("test")
2849 .unwrap()
2850 }
2851
2852 #[cfg(target_os = "linux")]
2853 fn mock_privileged_spec() -> ServiceSpec {
2854 serde_yaml::from_str::<DeploymentSpec>(
2855 r"
2856version: v1
2857deployment: test
2858services:
2859 test:
2860 rtype: service
2861 image:
2862 name: test:latest
2863 privileged: true
2864 endpoints:
2865 - name: http
2866 protocol: http
2867 port: 8080
2868",
2869 )
2870 .unwrap()
2871 .services
2872 .remove("test")
2873 .unwrap()
2874 }
2875
2876 #[test]
2877 fn test_parse_memory_string() {
2878 assert_eq!(parse_memory_string("512Mi").unwrap(), 512 * 1024 * 1024);
2879 assert_eq!(parse_memory_string("1Gi").unwrap(), 1024 * 1024 * 1024);
2880 assert_eq!(parse_memory_string("2G").unwrap(), 2 * 1000 * 1000 * 1000);
2881 assert_eq!(parse_memory_string("1024").unwrap(), 1024);
2882 assert_eq!(parse_memory_string("512Ki").unwrap(), 512 * 1024);
2883 }
2884
2885 #[test]
2886 fn test_parse_memory_string_errors() {
2887 assert!(parse_memory_string("").is_err());
2888 assert!(parse_memory_string("abc").is_err());
2889 assert!(parse_memory_string("12.5Mi").is_err());
2890 }
2891
2892 #[test]
2893 fn test_generate_resolv_conf_single_nameserver() {
2894 let out = generate_resolv_conf(&["10.42.0.1".to_string()]);
2895 assert_eq!(out, "nameserver 10.42.0.1\noptions edns0\n");
2896 }
2897
2898 #[test]
2899 fn test_generate_resolv_conf_two_nameservers() {
2900 let out = generate_resolv_conf(&["10.42.0.1".to_string(), "fd00::1".to_string()]);
2901 assert_eq!(
2902 out,
2903 "nameserver 10.42.0.1\nnameserver fd00::1\noptions edns0\n"
2904 );
2905 }
2906
2907 #[cfg(target_os = "linux")]
2908 #[tokio::test]
2909 async fn test_build_oci_spec_injects_resolv_conf_mount() {
2910 let dir = tempfile::tempdir().unwrap();
2911 let id = ContainerId::new("test".to_string(), 1);
2912 let mut spec = mock_spec();
2913 spec.dns = vec!["10.42.0.1".to_string()];
2914 let builder = BundleBuilder::new(dir.path().to_path_buf());
2915
2916 let oci_spec = builder
2917 .build_spec_only(&id, &spec, &std::collections::HashMap::new())
2918 .await
2919 .unwrap();
2920
2921 let mounts = oci_spec.mounts().as_ref().expect("mounts present");
2922 let resolv_mount = mounts
2923 .iter()
2924 .find(|m| m.destination() == Path::new("/etc/resolv.conf"))
2925 .expect("resolv.conf mount injected");
2926 let source = resolv_mount.source().as_ref().unwrap();
2927 let written = std::fs::read_to_string(source).unwrap();
2928 assert_eq!(written, "nameserver 10.42.0.1\noptions edns0\n");
2929 }
2930
2931 #[cfg(target_os = "linux")]
2932 #[tokio::test]
2933 async fn test_build_oci_spec_no_resolv_conf_when_dns_empty() {
2934 let dir = tempfile::tempdir().unwrap();
2935 let id = ContainerId::new("test".to_string(), 1);
2936 let spec = mock_spec(); let builder = BundleBuilder::new(dir.path().to_path_buf());
2938
2939 let oci_spec = builder
2940 .build_spec_only(&id, &spec, &std::collections::HashMap::new())
2941 .await
2942 .unwrap();
2943
2944 let mounts = oci_spec.mounts().as_ref().expect("mounts present");
2945 assert!(
2946 !mounts
2947 .iter()
2948 .any(|m| m.destination() == Path::new("/etc/resolv.conf")),
2949 "no resolv.conf mount should be injected for empty spec.dns"
2950 );
2951 }
2952
2953 #[cfg(target_os = "linux")]
2954 #[tokio::test]
2955 async fn test_build_oci_spec_no_resolv_conf_when_host_network() {
2956 let dir = tempfile::tempdir().unwrap();
2957 let id = ContainerId::new("test".to_string(), 1);
2958 let mut spec = mock_spec();
2959 spec.dns = vec!["10.42.0.1".to_string()];
2960 spec.host_network = true;
2961 let builder = BundleBuilder::new(dir.path().to_path_buf());
2962
2963 let oci_spec = builder
2964 .build_spec_only(&id, &spec, &std::collections::HashMap::new())
2965 .await
2966 .unwrap();
2967
2968 let mounts = oci_spec.mounts().as_ref().expect("mounts present");
2969 assert!(
2970 !mounts
2971 .iter()
2972 .any(|m| m.destination() == Path::new("/etc/resolv.conf")),
2973 "host_network containers must inherit the host resolv.conf"
2974 );
2975 }
2976
2977 #[test]
2978 fn test_bundle_builder_new() {
2979 let builder = BundleBuilder::new("/tmp/test-bundle".into());
2980 assert_eq!(builder.bundle_dir(), Path::new("/tmp/test-bundle"));
2981 assert!(builder.rootfs_path.is_none());
2982 }
2983
2984 #[test]
2985 fn test_bundle_builder_for_container() {
2986 let dirs = zlayer_paths::ZLayerDirs::system_default();
2987 let id = ContainerId::new("myservice".to_string(), 1);
2988 let builder = BundleBuilder::for_container(&id);
2989 assert_eq!(builder.bundle_dir(), dirs.bundles().join("myservice-rep-1"));
2990 }
2991
2992 #[test]
2993 fn test_bundle_builder_with_rootfs() {
2994 let dirs = zlayer_paths::ZLayerDirs::system_default();
2995 let builder = BundleBuilder::new("/tmp/test-bundle".into())
2996 .with_rootfs(dirs.rootfs().join("myimage"));
2997 assert_eq!(builder.rootfs_path, Some(dirs.rootfs().join("myimage")));
2998 }
2999
3000 #[cfg(target_os = "linux")]
3001 #[tokio::test]
3002 async fn test_build_oci_spec_basic() {
3003 let id = ContainerId::new("test".to_string(), 1);
3004 let spec = mock_spec();
3005 let builder = BundleBuilder::new("/tmp/test-bundle".into());
3006
3007 let oci_spec = builder
3008 .build_spec_only(&id, &spec, &std::collections::HashMap::new())
3009 .await
3010 .unwrap();
3011
3012 assert_eq!(oci_spec.version(), "1.0.2");
3013 assert!(oci_spec.root().is_some());
3014 assert_eq!(
3015 oci_spec.root().as_ref().unwrap().path(),
3016 std::path::Path::new("rootfs")
3017 );
3018 assert!(oci_spec.process().is_some());
3019 assert!(oci_spec.linux().is_some());
3020 }
3021
3022 #[cfg(target_os = "linux")]
3023 #[tokio::test]
3024 async fn test_build_oci_spec_with_resources() {
3025 let id = ContainerId::new("test".to_string(), 1);
3026 let spec = mock_spec_with_resources();
3027 let builder = BundleBuilder::new("/tmp/test-bundle".into());
3028
3029 let oci_spec = builder
3030 .build_spec_only(&id, &spec, &std::collections::HashMap::new())
3031 .await
3032 .unwrap();
3033
3034 let linux = oci_spec.linux().as_ref().unwrap();
3036 let resources = linux.resources().as_ref().unwrap();
3037
3038 let cpu = resources.cpu().as_ref().unwrap();
3040 assert_eq!(cpu.quota(), Some(50_000)); assert_eq!(cpu.period(), Some(100_000));
3042
3043 let memory = resources.memory().as_ref().unwrap();
3045 assert_eq!(memory.limit(), Some(512 * 1024 * 1024)); }
3047
3048 #[cfg(target_os = "linux")]
3049 #[tokio::test]
3050 async fn test_build_oci_spec_privileged() {
3051 let id = ContainerId::new("test".to_string(), 1);
3052 let spec = mock_privileged_spec();
3053 let builder = BundleBuilder::new("/tmp/test-bundle".into());
3054
3055 let oci_spec = builder
3056 .build_spec_only(&id, &spec, &std::collections::HashMap::new())
3057 .await
3058 .unwrap();
3059
3060 let process = oci_spec.process().as_ref().unwrap();
3062 let caps = process.capabilities().as_ref().unwrap();
3063 let bounding = caps.bounding().as_ref().unwrap();
3064
3065 assert!(bounding.contains(&Capability::SysAdmin));
3067 assert!(bounding.contains(&Capability::NetAdmin));
3068
3069 let linux = oci_spec.linux().as_ref().unwrap();
3071 assert!(
3072 linux.masked_paths().is_none() || linux.masked_paths().as_ref().unwrap().is_empty()
3073 );
3074 }
3075
3076 #[cfg(target_os = "linux")]
3077 #[tokio::test]
3078 async fn test_build_oci_spec_environment() {
3079 let id = ContainerId::new("test".to_string(), 1);
3080 let spec = mock_spec_with_resources();
3081 let builder = BundleBuilder::new("/tmp/test-bundle".into())
3082 .with_env("EXTRA_VAR".to_string(), "extra_value".to_string());
3083
3084 let oci_spec = builder
3085 .build_spec_only(&id, &spec, &std::collections::HashMap::new())
3086 .await
3087 .unwrap();
3088
3089 let process = oci_spec.process().as_ref().unwrap();
3090 let env = process.env().as_ref().unwrap();
3091
3092 assert!(env.iter().any(|e| e == "MY_VAR=my_value"));
3094 assert!(env.iter().any(|e| e == "ANOTHER=value2"));
3095 assert!(env.iter().any(|e| e == "EXTRA_VAR=extra_value"));
3097 assert!(env.iter().any(|e| e.starts_with("PATH=")));
3099 }
3100
3101 #[cfg(target_os = "linux")]
3102 #[tokio::test]
3103 async fn test_build_namespaces() {
3104 let id = ContainerId::new("test".to_string(), 1);
3105 let spec = mock_spec();
3106 let builder = BundleBuilder::new("/tmp/test-bundle".into());
3107
3108 let oci_spec = builder
3109 .build_spec_only(&id, &spec, &std::collections::HashMap::new())
3110 .await
3111 .unwrap();
3112 let linux = oci_spec.linux().as_ref().unwrap();
3113 let namespaces = linux.namespaces().as_ref().unwrap();
3114
3115 let namespace_types: Vec<_> = namespaces
3117 .iter()
3118 .map(oci_spec::runtime::LinuxNamespace::typ)
3119 .collect();
3120 assert!(namespace_types.contains(&LinuxNamespaceType::Pid));
3121 assert!(namespace_types.contains(&LinuxNamespaceType::Ipc));
3122 assert!(namespace_types.contains(&LinuxNamespaceType::Uts));
3123 assert!(namespace_types.contains(&LinuxNamespaceType::Mount));
3124 assert!(namespace_types.contains(&LinuxNamespaceType::Network));
3125 }
3126
3127 #[cfg(target_os = "linux")]
3128 #[tokio::test]
3129 async fn test_build_namespaces_host_network() {
3130 let id = ContainerId::new("test".to_string(), 1);
3131 let spec = mock_spec();
3132 let builder = BundleBuilder::new("/tmp/test-bundle".into()).with_host_network(true);
3133
3134 let oci_spec = builder
3135 .build_spec_only(&id, &spec, &std::collections::HashMap::new())
3136 .await
3137 .unwrap();
3138 let linux = oci_spec.linux().as_ref().unwrap();
3139 let namespaces = linux.namespaces().as_ref().unwrap();
3140
3141 let namespace_types: Vec<_> = namespaces
3143 .iter()
3144 .map(oci_spec::runtime::LinuxNamespace::typ)
3145 .collect();
3146 assert!(namespace_types.contains(&LinuxNamespaceType::Pid));
3147 assert!(namespace_types.contains(&LinuxNamespaceType::Ipc));
3148 assert!(namespace_types.contains(&LinuxNamespaceType::Uts));
3149 assert!(namespace_types.contains(&LinuxNamespaceType::Mount));
3150 assert!(
3151 !namespace_types.contains(&LinuxNamespaceType::Network),
3152 "Network namespace should NOT be present in host_network mode"
3153 );
3154 }
3155
3156 #[test]
3157 fn test_build_default_mounts() {
3158 let spec = mock_spec();
3159 let builder = BundleBuilder::new("/tmp/test-bundle".into());
3160
3161 let mounts = builder.build_default_mounts(&spec).unwrap();
3162
3163 let mount_destinations: Vec<_> = mounts
3165 .iter()
3166 .map(|m| m.destination().to_string_lossy().to_string())
3167 .collect();
3168 assert!(mount_destinations.contains(&"/proc".to_string()));
3169 assert!(mount_destinations.contains(&"/dev".to_string()));
3170 assert!(mount_destinations.contains(&"/dev/pts".to_string()));
3171 assert!(mount_destinations.contains(&"/dev/shm".to_string()));
3172 assert!(mount_destinations.contains(&"/sys".to_string()));
3173 }
3174
3175 #[test]
3176 fn test_build_storage_mounts_bind() {
3177 let spec = serde_yaml::from_str::<zlayer_spec::DeploymentSpec>(
3178 r"
3179version: v1
3180deployment: test
3181services:
3182 test:
3183 image:
3184 name: test:latest
3185 storage:
3186 - type: bind
3187 source: /host/data
3188 target: /app/data
3189 readonly: true
3190",
3191 )
3192 .unwrap()
3193 .services
3194 .remove("test")
3195 .unwrap();
3196
3197 let builder = BundleBuilder::new("/tmp/test-bundle".into());
3198 let volume_paths = std::collections::HashMap::new();
3199
3200 let mounts = builder.build_storage_mounts(&spec, &volume_paths).unwrap();
3201
3202 assert_eq!(mounts.len(), 1);
3203 assert_eq!(mounts[0].destination().to_string_lossy(), "/app/data");
3204 assert_eq!(
3205 mounts[0]
3206 .source()
3207 .as_ref()
3208 .map(|s| s.to_string_lossy().to_string()),
3209 Some("/host/data".to_string())
3210 );
3211 let options = mounts[0].options().as_ref().unwrap();
3212 assert!(options.contains(&"rbind".to_string()));
3213 assert!(options.contains(&"ro".to_string()));
3214 }
3215
3216 #[test]
3217 fn test_build_storage_mounts_named() {
3218 let spec = serde_yaml::from_str::<zlayer_spec::DeploymentSpec>(
3219 r"
3220version: v1
3221deployment: test
3222services:
3223 test:
3224 image:
3225 name: test:latest
3226 storage:
3227 - type: named
3228 name: my-volume
3229 target: /app/data
3230",
3231 )
3232 .unwrap()
3233 .services
3234 .remove("test")
3235 .unwrap();
3236
3237 let dirs = zlayer_paths::ZLayerDirs::system_default();
3238 let builder = BundleBuilder::new("/tmp/test-bundle".into());
3239 let mut volume_paths = std::collections::HashMap::new();
3240 volume_paths.insert("my-volume".to_string(), dirs.volumes().join("my-volume"));
3241
3242 let mounts = builder.build_storage_mounts(&spec, &volume_paths).unwrap();
3243
3244 assert_eq!(mounts.len(), 1);
3245 assert_eq!(mounts[0].destination().to_string_lossy(), "/app/data");
3246 assert_eq!(
3247 mounts[0]
3248 .source()
3249 .as_ref()
3250 .map(|s| s.to_string_lossy().to_string()),
3251 Some(
3252 dirs.volumes()
3253 .join("my-volume")
3254 .to_string_lossy()
3255 .into_owned()
3256 )
3257 );
3258 }
3259
3260 #[test]
3261 fn test_build_storage_mounts_tmpfs() {
3262 let spec = serde_yaml::from_str::<zlayer_spec::DeploymentSpec>(
3263 r"
3264version: v1
3265deployment: test
3266services:
3267 test:
3268 image:
3269 name: test:latest
3270 storage:
3271 - type: tmpfs
3272 target: /app/tmp
3273 size: 256Mi
3274 mode: 1777
3275",
3276 )
3277 .unwrap()
3278 .services
3279 .remove("test")
3280 .unwrap();
3281
3282 let builder = BundleBuilder::new("/tmp/test-bundle".into());
3283 let volume_paths = std::collections::HashMap::new();
3284
3285 let mounts = builder.build_storage_mounts(&spec, &volume_paths).unwrap();
3286
3287 assert_eq!(mounts.len(), 1);
3288 assert_eq!(mounts[0].destination().to_string_lossy(), "/app/tmp");
3289 assert_eq!(mounts[0].typ().as_ref().map(String::as_str), Some("tmpfs"));
3290 let options = mounts[0].options().as_ref().unwrap();
3291 assert!(options.iter().any(|o| o.starts_with("size=")));
3292 assert!(options.iter().any(|o| o.starts_with("mode=")));
3293 }
3294
3295 #[test]
3296 fn test_build_storage_mounts_multiple() {
3297 let spec = serde_yaml::from_str::<zlayer_spec::DeploymentSpec>(
3298 r"
3299version: v1
3300deployment: test
3301services:
3302 test:
3303 image:
3304 name: test:latest
3305 storage:
3306 - type: bind
3307 source: /etc/config
3308 target: /app/config
3309 readonly: true
3310 - type: named
3311 name: app-data
3312 target: /app/data
3313 - type: tmpfs
3314 target: /app/tmp
3315",
3316 )
3317 .unwrap()
3318 .services
3319 .remove("test")
3320 .unwrap();
3321
3322 let dirs = zlayer_paths::ZLayerDirs::system_default();
3323 let builder = BundleBuilder::new("/tmp/test-bundle".into());
3324 let mut volume_paths = std::collections::HashMap::new();
3325 volume_paths.insert("app-data".to_string(), dirs.volumes().join("app-data"));
3326
3327 let mounts = builder.build_storage_mounts(&spec, &volume_paths).unwrap();
3328
3329 assert_eq!(mounts.len(), 3);
3330
3331 let destinations: Vec<String> = mounts
3333 .iter()
3334 .map(|m| m.destination().to_string_lossy().to_string())
3335 .collect();
3336 assert!(destinations.contains(&"/app/config".to_string()));
3337 assert!(destinations.contains(&"/app/data".to_string()));
3338 assert!(destinations.contains(&"/app/tmp".to_string()));
3339 }
3340
3341 #[test]
3342 fn test_build_storage_mounts_anonymous_missing_path() {
3343 let spec = serde_yaml::from_str::<zlayer_spec::DeploymentSpec>(
3344 r"
3345version: v1
3346deployment: test
3347services:
3348 test:
3349 image:
3350 name: test:latest
3351 storage:
3352 - type: anonymous
3353 target: /app/cache
3354",
3355 )
3356 .unwrap()
3357 .services
3358 .remove("test")
3359 .unwrap();
3360
3361 let builder = BundleBuilder::new("/tmp/test-bundle".into());
3362 let volume_paths = std::collections::HashMap::new(); let result = builder.build_storage_mounts(&spec, &volume_paths);
3365
3366 assert!(result.is_err());
3368 }
3369
3370 #[cfg(target_os = "linux")]
3371 #[tokio::test]
3372 async fn test_oci_spec_includes_storage_mounts() {
3373 let id = ContainerId::new("test".to_string(), 1);
3374 let spec = serde_yaml::from_str::<zlayer_spec::DeploymentSpec>(
3375 r"
3376version: v1
3377deployment: test
3378services:
3379 test:
3380 image:
3381 name: test:latest
3382 storage:
3383 - type: bind
3384 source: /host/data
3385 target: /app/data
3386 - type: tmpfs
3387 target: /app/tmp
3388",
3389 )
3390 .unwrap()
3391 .services
3392 .remove("test")
3393 .unwrap();
3394
3395 let builder = BundleBuilder::new("/tmp/test-bundle".into());
3396 let volume_paths = std::collections::HashMap::new();
3397
3398 let oci_spec = builder
3399 .build_spec_only(&id, &spec, &volume_paths)
3400 .await
3401 .unwrap();
3402
3403 let mounts = oci_spec.mounts().as_ref().unwrap();
3405 let destinations: Vec<String> = mounts
3406 .iter()
3407 .map(|m| m.destination().to_string_lossy().to_string())
3408 .collect();
3409
3410 assert!(destinations.contains(&"/proc".to_string())); assert!(destinations.contains(&"/dev".to_string())); assert!(destinations.contains(&"/app/data".to_string())); assert!(destinations.contains(&"/app/tmp".to_string())); }
3416
3417 fn mock_gpu_spec(vendor: &str, count: u32) -> ServiceSpec {
3418 let yaml = format!(
3419 "
3420version: v1
3421deployment: test
3422services:
3423 test:
3424 rtype: service
3425 image:
3426 name: test:latest
3427 resources:
3428 gpu:
3429 count: {count}
3430 vendor: {vendor}
3431 endpoints:
3432 - name: http
3433 protocol: http
3434 port: 8080
3435"
3436 );
3437 serde_yaml::from_str::<DeploymentSpec>(&yaml)
3438 .unwrap()
3439 .services
3440 .remove("test")
3441 .unwrap()
3442 }
3443
3444 fn write_nvidia_cdi_fixture(dir: &std::path::Path, json: &str) {
3445 std::fs::write(dir.join("nvidia.json"), json).unwrap();
3446 }
3447
3448 fn nvidia_cdi_fixture() -> &'static str {
3449 r#"{
3450 "cdiVersion": "0.6.0",
3451 "kind": "nvidia.com/gpu",
3452 "devices": [{
3453 "name": "0",
3454 "containerEdits": {
3455 "deviceNodes": [
3456 {"path": "/dev/nvidia0", "type": "c", "major": 195, "minor": 0}
3457 ],
3458 "env": ["NVIDIA_VISIBLE_DEVICES=0"],
3459 "hooks": {
3460 "createContainer": [{
3461 "path": "/usr/bin/nvidia-container-runtime-hook",
3462 "args": ["nvidia-container-runtime-hook", "prestart"]
3463 }]
3464 }
3465 }
3466 }]
3467 }"#
3468 }
3469
3470 #[cfg(target_os = "linux")]
3471 #[tokio::test]
3472 async fn gpu_spec_translates_to_cdi_device_nodes() {
3473 let dir = tempfile::tempdir().unwrap();
3474 write_nvidia_cdi_fixture(dir.path(), nvidia_cdi_fixture());
3475 let registry = std::sync::Arc::new(crate::cdi::CdiRegistry::discover_from(&[dir.path()]));
3476
3477 let id = ContainerId::new("test".to_string(), 1);
3478 let spec = mock_gpu_spec("nvidia", 1);
3479 let builder = BundleBuilder::new("/tmp/test-bundle-cdi".into()).with_cdi_registry(registry);
3480
3481 let oci_spec = builder
3482 .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3483 .await
3484 .expect("build with CDI fixture");
3485
3486 let linux = oci_spec.linux().as_ref().expect("linux config present");
3488 let devices = linux.devices().as_ref().expect("devices present");
3489 assert!(
3490 devices
3491 .iter()
3492 .any(|d| d.path() == std::path::Path::new("/dev/nvidia0")),
3493 "expected /dev/nvidia0 from CDI fixture; got {:?}",
3494 devices
3495 .iter()
3496 .map(oci_spec::runtime::LinuxDevice::path)
3497 .collect::<Vec<_>>()
3498 );
3499
3500 let process = oci_spec.process().as_ref().expect("process present");
3502 let env = process.env().as_ref().expect("env present");
3503 assert!(
3504 env.iter().any(|e| e == "NVIDIA_VISIBLE_DEVICES=0"),
3505 "expected NVIDIA_VISIBLE_DEVICES=0 in env; got {env:?}"
3506 );
3507
3508 let hooks = oci_spec.hooks().as_ref().expect("hooks present");
3510 let create_container = hooks
3511 .create_container()
3512 .as_ref()
3513 .expect("createContainer hooks present");
3514 assert_eq!(create_container.len(), 1);
3515 assert_eq!(
3516 create_container[0].path(),
3517 &std::path::PathBuf::from("/usr/bin/nvidia-container-runtime-hook")
3518 );
3519 }
3520
3521 #[tokio::test]
3522 async fn gpu_spec_with_missing_cdi_returns_error() {
3523 let dir = tempfile::tempdir().unwrap();
3525 let registry = std::sync::Arc::new(crate::cdi::CdiRegistry::discover_from(&[dir.path()]));
3526
3527 let id = ContainerId::new("test".to_string(), 1);
3528 let spec = mock_gpu_spec("nvidia", 1);
3529 let builder =
3530 BundleBuilder::new("/tmp/test-bundle-cdi-missing".into()).with_cdi_registry(registry);
3531
3532 let err = builder
3533 .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3534 .await
3535 .expect_err("should fail when CDI registry is empty");
3536
3537 match err {
3538 AgentError::InvalidSpec(msg) => {
3539 assert!(
3540 msg.contains("nvidia") || msg.contains("CDI"),
3541 "error should mention CDI / vendor; got: {msg}"
3542 );
3543 }
3544 other => panic!("expected InvalidSpec, got {other:?}"),
3545 }
3546 }
3547
3548 #[tokio::test]
3549 async fn gpu_spec_with_unknown_device_returns_error() {
3550 let dir = tempfile::tempdir().unwrap();
3553 write_nvidia_cdi_fixture(dir.path(), nvidia_cdi_fixture());
3554 let registry = std::sync::Arc::new(crate::cdi::CdiRegistry::discover_from(&[dir.path()]));
3555
3556 let id = ContainerId::new("test".to_string(), 1);
3557 let spec = mock_gpu_spec("nvidia", 2);
3558 let builder =
3559 BundleBuilder::new("/tmp/test-bundle-cdi-unknown".into()).with_cdi_registry(registry);
3560
3561 let err = builder
3562 .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3563 .await
3564 .expect_err("should fail when device '1' is not declared");
3565 match err {
3566 AgentError::InvalidSpec(msg) => {
3567 assert!(
3568 msg.contains("'1'") || msg.contains("device"),
3569 "error should mention the missing device; got: {msg}"
3570 );
3571 }
3572 other => panic!("expected InvalidSpec, got {other:?}"),
3573 }
3574 }
3575
3576 #[cfg(target_os = "linux")]
3577 #[tokio::test]
3578 async fn gpu_spec_with_all_devices_expands_to_all_in_spec() {
3579 let dir = tempfile::tempdir().unwrap();
3581 let fixture = r#"{
3582 "cdiVersion": "0.6.0",
3583 "kind": "nvidia.com/gpu",
3584 "devices": [
3585 {
3586 "name": "0",
3587 "containerEdits": {
3588 "env": ["NVIDIA_VISIBLE_DEVICES=0"],
3589 "deviceNodes": [
3590 {"path": "/dev/nvidia0", "type": "c", "major": 195, "minor": 0}
3591 ]
3592 }
3593 },
3594 {
3595 "name": "1",
3596 "containerEdits": {
3597 "env": ["NVIDIA_VISIBLE_DEVICES=1"],
3598 "deviceNodes": [
3599 {"path": "/dev/nvidia1", "type": "c", "major": 195, "minor": 1}
3600 ]
3601 }
3602 }
3603 ]
3604 }"#;
3605 write_nvidia_cdi_fixture(dir.path(), fixture);
3606 let registry = std::sync::Arc::new(crate::cdi::CdiRegistry::discover_from(&[dir.path()]));
3607
3608 let edits = registry
3611 .resolve_for_kind("nvidia.com/gpu", &["all".to_string()])
3612 .expect("resolve all");
3613 assert_eq!(edits.len(), 2);
3614
3615 let id = ContainerId::new("test".to_string(), 1);
3618 let spec = mock_gpu_spec("nvidia", 2);
3619 let builder =
3620 BundleBuilder::new("/tmp/test-bundle-cdi-all".into()).with_cdi_registry(registry);
3621
3622 let oci_spec = builder
3623 .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3624 .await
3625 .expect("build with 2-device fixture");
3626
3627 let devices = oci_spec
3628 .linux()
3629 .as_ref()
3630 .unwrap()
3631 .devices()
3632 .as_ref()
3633 .expect("devices present");
3634 let paths: Vec<_> = devices.iter().map(|d| d.path().clone()).collect();
3635 assert!(paths.contains(&std::path::PathBuf::from("/dev/nvidia0")));
3636 assert!(paths.contains(&std::path::PathBuf::from("/dev/nvidia1")));
3637 }
3638
3639 fn build_nvidia_cdi_registry(dir: &std::path::Path) -> std::sync::Arc<crate::cdi::CdiRegistry> {
3644 write_nvidia_cdi_fixture(dir, nvidia_cdi_fixture());
3645 std::sync::Arc::new(crate::cdi::CdiRegistry::discover_from(&[dir]))
3646 }
3647
3648 #[cfg(target_os = "linux")]
3649 #[tokio::test]
3650 async fn gpu_spec_with_mps_sharing_injects_env_and_mounts() {
3651 let cdi_dir = tempfile::tempdir().unwrap();
3655 let mps_root = tempfile::tempdir().unwrap();
3656 let pipe_dir = mps_root.path().join("nvidia-mps");
3657 let log_dir = mps_root.path().join("nvidia-log");
3658 std::fs::create_dir(&pipe_dir).unwrap();
3659 std::fs::create_dir(&log_dir).unwrap();
3660 let registry = build_nvidia_cdi_registry(cdi_dir.path());
3661
3662 let id = ContainerId::new("test".to_string(), 1);
3663 let mut spec = mock_gpu_spec("nvidia", 1);
3664 let gpu = spec.resources.gpu.as_mut().expect("gpu spec set");
3665 gpu.sharing = Some(zlayer_spec::GpuSharingMode::Mps);
3666 gpu.mps_pipe_dir = Some(pipe_dir.to_string_lossy().into_owned());
3667 gpu.mps_log_dir = Some(log_dir.to_string_lossy().into_owned());
3668
3669 let builder =
3670 BundleBuilder::new("/tmp/test-bundle-mps-env".into()).with_cdi_registry(registry);
3671 let oci_spec = builder
3672 .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3673 .await
3674 .expect("build with MPS sharing");
3675
3676 let env = oci_spec
3677 .process()
3678 .as_ref()
3679 .and_then(|p| p.env().as_ref())
3680 .expect("env present");
3681 let pipe_expect = format!("CUDA_MPS_PIPE_DIRECTORY={}", pipe_dir.display());
3682 let log_expect = format!("CUDA_MPS_LOG_DIRECTORY={}", log_dir.display());
3683 assert!(
3684 env.iter().any(|e| e == &pipe_expect),
3685 "expected {pipe_expect} in env; got {env:?}"
3686 );
3687 assert!(
3688 env.iter().any(|e| e == &log_expect),
3689 "expected {log_expect} in env; got {env:?}"
3690 );
3691
3692 let mounts = oci_spec.mounts().as_ref().expect("mounts present");
3693 assert!(
3694 mounts
3695 .iter()
3696 .any(|m| m.destination() == &pipe_dir && m.source().as_ref() == Some(&pipe_dir)),
3697 "expected bind mount of MPS pipe dir {}; got destinations {:?}",
3698 pipe_dir.display(),
3699 mounts.iter().map(Mount::destination).collect::<Vec<_>>()
3700 );
3701 assert!(
3702 mounts
3703 .iter()
3704 .any(|m| m.destination() == &log_dir && m.source().as_ref() == Some(&log_dir)),
3705 "expected bind mount of MPS log dir {}",
3706 log_dir.display()
3707 );
3708 }
3709
3710 #[tokio::test]
3711 async fn gpu_spec_with_mps_sharing_fails_when_pipe_dir_missing() {
3712 let cdi_dir = tempfile::tempdir().unwrap();
3713 let registry = build_nvidia_cdi_registry(cdi_dir.path());
3714
3715 let id = ContainerId::new("test".to_string(), 1);
3716 let mut spec = mock_gpu_spec("nvidia", 1);
3717 let gpu = spec.resources.gpu.as_mut().expect("gpu spec set");
3718 gpu.sharing = Some(zlayer_spec::GpuSharingMode::Mps);
3719 let missing = tempfile::tempdir().unwrap();
3722 let missing_path = missing.path().join("definitely-not-here");
3723 gpu.mps_pipe_dir = Some(missing_path.to_string_lossy().into_owned());
3724
3725 let builder =
3726 BundleBuilder::new("/tmp/test-bundle-mps-missing".into()).with_cdi_registry(registry);
3727 let err = builder
3728 .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3729 .await
3730 .expect_err("should fail when MPS pipe dir is missing");
3731 match err {
3732 AgentError::GpuSharingUnavailable { mode, reason } => {
3733 assert_eq!(mode, "mps");
3734 assert!(
3735 reason.contains("pipe") || reason.contains(&missing_path.display().to_string()),
3736 "reason should mention the missing path; got: {reason}"
3737 );
3738 }
3739 other => panic!("expected GpuSharingUnavailable, got {other:?}"),
3740 }
3741 }
3742
3743 #[cfg(target_os = "linux")]
3744 #[tokio::test]
3745 async fn gpu_spec_with_timeslicing_injects_visible_devices() {
3746 let cdi_dir = tempfile::tempdir().unwrap();
3747 let registry = build_nvidia_cdi_registry(cdi_dir.path());
3748
3749 let id = ContainerId::new("test".to_string(), 1);
3750 let mut spec = mock_gpu_spec("nvidia", 1);
3751 let gpu = spec.resources.gpu.as_mut().expect("gpu spec set");
3752 gpu.sharing = Some(zlayer_spec::GpuSharingMode::TimeSlice);
3753 gpu.time_slice_index = Some(2);
3754
3755 let builder =
3756 BundleBuilder::new("/tmp/test-bundle-timeslice".into()).with_cdi_registry(registry);
3757 let oci_spec = builder
3758 .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3759 .await
3760 .expect("build with time-slicing");
3761
3762 let env = oci_spec
3763 .process()
3764 .as_ref()
3765 .and_then(|p| p.env().as_ref())
3766 .expect("env present");
3767 let cuda_entries: Vec<&String> = env
3770 .iter()
3771 .filter(|e| e.starts_with("CUDA_VISIBLE_DEVICES="))
3772 .collect();
3773 assert_eq!(
3774 cuda_entries.len(),
3775 1,
3776 "exactly one CUDA_VISIBLE_DEVICES expected; got {cuda_entries:?}"
3777 );
3778 assert_eq!(cuda_entries[0], "CUDA_VISIBLE_DEVICES=2");
3779 }
3780
3781 #[cfg(target_os = "linux")]
3782 #[tokio::test]
3783 async fn gpu_spec_without_sharing_omits_mps_env() {
3784 let cdi_dir = tempfile::tempdir().unwrap();
3785 let registry = build_nvidia_cdi_registry(cdi_dir.path());
3786
3787 let id = ContainerId::new("test".to_string(), 1);
3788 let spec = mock_gpu_spec("nvidia", 1);
3789 assert!(spec.resources.gpu.as_ref().unwrap().sharing.is_none());
3790
3791 let builder =
3792 BundleBuilder::new("/tmp/test-bundle-no-sharing".into()).with_cdi_registry(registry);
3793 let oci_spec = builder
3794 .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3795 .await
3796 .expect("build without sharing");
3797
3798 let env = oci_spec
3799 .process()
3800 .as_ref()
3801 .and_then(|p| p.env().as_ref())
3802 .expect("env present");
3803 assert!(
3804 !env.iter().any(|e| e.starts_with("CUDA_MPS_")),
3805 "no CUDA_MPS_* env should be present without sharing; got {env:?}"
3806 );
3807
3808 let mounts = oci_spec.mounts().as_ref().expect("mounts present");
3812 assert!(
3813 !mounts
3814 .iter()
3815 .any(|m| { m.destination().to_string_lossy().contains("nvidia-mps") }),
3816 "no MPS pipe mount should be present without sharing"
3817 );
3818 }
3819
3820 #[cfg(unix)]
3821 mod subid_tests {
3822 use super::super::read_subid_range;
3823 use std::io::Write;
3824
3825 #[test]
3826 fn read_subid_range_returns_range_for_user() {
3827 let mut tmp = tempfile::NamedTempFile::new().unwrap();
3828 writeln!(tmp, "alice:100000:65536").unwrap();
3829 writeln!(tmp, "bob:165536:65536").unwrap();
3830 tmp.flush().unwrap();
3831 let path = tmp.path().to_str().unwrap();
3832 assert_eq!(read_subid_range(path, "bob"), Some((165_536, 65_536)));
3833 assert_eq!(read_subid_range(path, "alice"), Some((100_000, 65_536)));
3834 }
3835
3836 #[test]
3837 fn read_subid_range_returns_none_for_unknown_user() {
3838 let mut tmp = tempfile::NamedTempFile::new().unwrap();
3839 writeln!(tmp, "alice:100000:65536").unwrap();
3840 tmp.flush().unwrap();
3841 assert_eq!(
3842 read_subid_range(tmp.path().to_str().unwrap(), "carol"),
3843 None
3844 );
3845 }
3846
3847 #[test]
3848 fn read_subid_range_returns_none_on_missing_file() {
3849 assert_eq!(
3850 read_subid_range("/this/path/does/not/exist/subuid", "anyone"),
3851 None
3852 );
3853 }
3854 }
3855}