1use crate::cdi::{self, CdiContainerEdits, CdiRegistry};
9use crate::error::{AgentError, Result};
10use crate::runtime::ContainerId;
11use oci_spec::runtime::{
12 Capability, Hook, HookBuilder, Hooks, HooksBuilder, LinuxBuilder, LinuxCapabilitiesBuilder,
13 LinuxCpuBuilder, LinuxDeviceBuilder, LinuxDeviceCgroupBuilder, LinuxDeviceType,
14 LinuxMemoryBuilder, LinuxNamespaceBuilder, LinuxNamespaceType, LinuxResourcesBuilder, Mount,
15 MountBuilder, PosixRlimit, PosixRlimitBuilder, PosixRlimitType, ProcessBuilder, RootBuilder,
16 Spec, SpecBuilder, UserBuilder,
17};
18#[cfg(unix)]
21use oci_spec::runtime::LinuxIdMappingBuilder;
22use std::collections::{HashMap, HashSet};
23use std::path::{Path, PathBuf};
32use std::str::FromStr;
33use std::sync::Arc;
34use tokio::fs;
35use zlayer_secrets::SecretsProvider;
36use zlayer_spec::{GpuSharingMode, ServiceSpec, StorageSpec, StorageTier};
37
/// Host directory used for the MPS pipe when the GPU spec does not set `mps_pipe_dir`.
const DEFAULT_MPS_PIPE_DIR: &str = "/tmp/nvidia-mps";

/// Host directory used for MPS logs when the GPU spec does not set `mps_log_dir`.
const DEFAULT_MPS_LOG_DIR: &str = "/tmp/nvidia-log";

/// Destination path inside the container for the bind-mounted time-slicing config file.
const TIMESLICE_CONFIG_CONTAINER_PATH: &str = "/etc/nvidia/gpu-time-slicing.yaml";
51
/// Host directories resolved for MPS GPU sharing.
///
/// Both paths have been checked to be existing directories by `resolve_mps_dirs`.
struct MpsDirs {
    /// Directory exported to the container as `CUDA_MPS_PIPE_DIRECTORY`.
    pipe_dir: PathBuf,
    /// Directory exported to the container as `CUDA_MPS_LOG_DIRECTORY`.
    log_dir: PathBuf,
}
61
62fn resolve_mps_dirs(gpu: &zlayer_spec::GpuSpec) -> Result<Option<MpsDirs>> {
72 if gpu.sharing != Some(GpuSharingMode::Mps) {
73 return Ok(None);
74 }
75
76 let pipe_dir = PathBuf::from(gpu.mps_pipe_dir.as_deref().unwrap_or(DEFAULT_MPS_PIPE_DIR));
77 let log_dir = PathBuf::from(gpu.mps_log_dir.as_deref().unwrap_or(DEFAULT_MPS_LOG_DIR));
78
79 if !pipe_dir.is_dir() {
80 return Err(AgentError::GpuSharingUnavailable {
81 mode: "mps".to_string(),
82 reason: format!(
83 "MPS pipe directory {} does not exist; ensure nvidia-cuda-mps-control is running",
84 pipe_dir.display()
85 ),
86 });
87 }
88 if !log_dir.is_dir() {
89 return Err(AgentError::GpuSharingUnavailable {
90 mode: "mps".to_string(),
91 reason: format!(
92 "MPS log directory {} does not exist; ensure nvidia-cuda-mps-control is running",
93 log_dir.display()
94 ),
95 });
96 }
97
98 Ok(Some(MpsDirs { pipe_dir, log_dir }))
99}
100
101fn cdi_node_to_oci_device(
110 node: &crate::cdi::CdiDeviceNode,
111) -> Result<oci_spec::runtime::LinuxDevice> {
112 let host_path = node.host_path.as_deref().unwrap_or(&node.path);
113
114 let dev_type = match node.device_type.as_deref() {
115 Some("c" | "u") => LinuxDeviceType::C,
116 Some("b") => LinuxDeviceType::B,
117 Some("p") => LinuxDeviceType::P,
118 _ => get_device_type(host_path).unwrap_or(LinuxDeviceType::C),
119 };
120
121 let (major, minor) = if let (Some(maj), Some(min)) = (node.major, node.minor) {
122 (maj, min)
123 } else {
124 get_device_major_minor(host_path).unwrap_or((0, 0))
125 };
126
127 let mut builder = LinuxDeviceBuilder::default()
128 .path(node.path.clone())
129 .typ(dev_type)
130 .major(major)
131 .minor(minor);
132 if let Some(mode) = node.file_mode {
133 builder = builder.file_mode(mode);
134 } else {
135 builder = builder.file_mode(0o666u32);
136 }
137 builder = builder.uid(node.uid.unwrap_or(0));
138 builder = builder.gid(node.gid.unwrap_or(0));
139
140 builder.build().map_err(|e| {
141 AgentError::InvalidSpec(format!(
142 "failed to build CDI device {path}: {e}",
143 path = node.path
144 ))
145 })
146}
147
148fn convert_cdi_hook(cdi_hook: &crate::cdi::CdiHook) -> Result<Hook> {
150 let mut builder = HookBuilder::default().path(PathBuf::from(&cdi_hook.path));
151 if !cdi_hook.args.is_empty() {
152 builder = builder.args(cdi_hook.args.clone());
153 }
154 if !cdi_hook.env.is_empty() {
155 builder = builder.env(cdi_hook.env.clone());
156 }
157 builder
158 .build()
159 .map_err(|e| AgentError::InvalidSpec(format!("failed to build CDI hook: {e}")))
160}
161
/// The capabilities granted to privileged containers.
///
/// `build_capabilities` copies this list into the bounding, effective and
/// permitted sets when the service spec is marked `privileged`.
const ALL_CAPABILITIES: &[Capability] = &[
    Capability::AuditControl,
    Capability::AuditRead,
    Capability::AuditWrite,
    Capability::BlockSuspend,
    Capability::Bpf,
    Capability::CheckpointRestore,
    Capability::Chown,
    Capability::DacOverride,
    Capability::DacReadSearch,
    Capability::Fowner,
    Capability::Fsetid,
    Capability::IpcLock,
    Capability::IpcOwner,
    Capability::Kill,
    Capability::Lease,
    Capability::LinuxImmutable,
    Capability::MacAdmin,
    Capability::MacOverride,
    Capability::Mknod,
    Capability::NetAdmin,
    Capability::NetBindService,
    Capability::NetBroadcast,
    Capability::NetRaw,
    Capability::Perfmon,
    Capability::Setfcap,
    Capability::Setgid,
    Capability::Setpcap,
    Capability::Setuid,
    Capability::SysAdmin,
    Capability::SysBoot,
    Capability::SysChroot,
    Capability::SysModule,
    Capability::SysNice,
    Capability::SysPacct,
    Capability::SysPtrace,
    Capability::SysRawio,
    Capability::SysResource,
    Capability::SysTime,
    Capability::SysTtyConfig,
    Capability::Syslog,
    Capability::WakeAlarm,
];
206
/// Renders the contents of a `resolv.conf` file.
///
/// Emits one `nameserver <addr>` line per entry, in the given order, followed
/// by a final `options edns0` line. An empty slice yields only the options line.
#[must_use]
pub fn generate_resolv_conf(nameservers: &[String]) -> String {
    nameservers
        .iter()
        .flat_map(|server| ["nameserver ", server.as_str(), "\n"])
        .chain(std::iter::once("options edns0\n"))
        .collect()
}
247
/// Parses a memory quantity such as `512Mi`, `2G` or `1024` into bytes.
///
/// Surrounding whitespace is ignored. Supported suffixes:
///
/// * binary: `Ki`, `Mi`, `Gi`, `Ti` (powers of 1024)
/// * decimal: `K`/`k`, `M`/`m`, `G`/`g`, `T`/`t` (powers of 1000)
/// * no suffix: plain bytes
///
/// # Errors
///
/// Returns a description of the problem when the input is empty, the numeric
/// part is not a valid `u64`, or the resulting byte count does not fit in a
/// `u64`.
pub fn parse_memory_string(s: &str) -> std::result::Result<u64, String> {
    /// Recognised unit suffixes and their byte multipliers.
    const UNITS: &[(&str, u64)] = &[
        ("Ki", 1024),
        ("Mi", 1024 * 1024),
        ("Gi", 1024 * 1024 * 1024),
        ("Ti", 1024 * 1024 * 1024 * 1024),
        ("K", 1000),
        ("k", 1000),
        ("M", 1000 * 1000),
        ("m", 1000 * 1000),
        ("G", 1000 * 1000 * 1000),
        ("g", 1000 * 1000 * 1000),
        ("T", 1000 * 1000 * 1000 * 1000),
        ("t", 1000 * 1000 * 1000 * 1000),
    ];

    let s = s.trim();
    if s.is_empty() {
        return Err("empty memory string".to_string());
    }

    let (num_str, multiplier) = UNITS
        .iter()
        .find_map(|&(suffix, factor)| s.strip_suffix(suffix).map(|digits| (digits, factor)))
        .unwrap_or((s, 1));

    let num: u64 = num_str
        .parse()
        .map_err(|e| format!("invalid number: {e}"))?;

    // A plain `*` would panic (debug) or silently wrap (release) on huge inputs.
    num.checked_mul(multiplier)
        .ok_or_else(|| format!("memory value `{s}` does not fit in 64 bits"))
}
282
/// Splits a Linux `dev_t` value into its `(major, minor)` numbers.
///
/// Linux spreads both numbers over the 64-bit value (the layout used by
/// glibc's `major()` / `minor()` macros): the low 12 bits of the major sit
/// at bits 8..20 and its high bits at 44..64, while the low 8 bits of the
/// minor sit at bits 0..8 and its high bits at 20..44. Decoding only 8 bits
/// of each would truncate majors or minors above 255.
#[cfg(unix)]
#[allow(clippy::cast_possible_wrap)] // both halves are masked to at most 32 bits
fn split_linux_rdev(rdev: u64) -> (i64, i64) {
    let major = ((rdev >> 32) & 0xffff_f000) | ((rdev >> 8) & 0x0000_0fff);
    let minor = ((rdev >> 12) & 0xffff_ff00) | (rdev & 0x0000_00ff);
    (major as i64, minor as i64)
}

/// Reads the device `(major, minor)` numbers of the node at `path`.
///
/// The node's `st_rdev` is decoded with the Linux `dev_t` layout, since the
/// result feeds Linux OCI device entries.
///
/// # Errors
///
/// Returns the underlying I/O error when the path's metadata cannot be read.
#[cfg(unix)]
fn get_device_major_minor(path: &str) -> std::io::Result<(i64, i64)> {
    use std::os::unix::fs::MetadataExt;
    let metadata = std::fs::metadata(path)?;
    Ok(split_linux_rdev(metadata.rdev()))
}

/// Stub for platforms without Unix device nodes.
///
/// # Errors
///
/// Always returns an [`std::io::ErrorKind::Unsupported`] error.
#[cfg(not(unix))]
fn get_device_major_minor(_path: &str) -> std::io::Result<(i64, i64)> {
    Err(std::io::Error::new(
        std::io::ErrorKind::Unsupported,
        "device-cgroup probes require Unix",
    ))
}
311
312fn ulimit_name_to_posix(name: &str) -> Option<PosixRlimitType> {
316 Some(match name.to_ascii_lowercase().as_str() {
317 "cpu" => PosixRlimitType::RlimitCpu,
318 "fsize" => PosixRlimitType::RlimitFsize,
319 "data" => PosixRlimitType::RlimitData,
320 "stack" => PosixRlimitType::RlimitStack,
321 "core" => PosixRlimitType::RlimitCore,
322 "rss" => PosixRlimitType::RlimitRss,
323 "nproc" => PosixRlimitType::RlimitNproc,
324 "nofile" => PosixRlimitType::RlimitNofile,
325 "memlock" => PosixRlimitType::RlimitMemlock,
326 "as" => PosixRlimitType::RlimitAs,
327 "locks" => PosixRlimitType::RlimitLocks,
328 "sigpending" => PosixRlimitType::RlimitSigpending,
329 "msgqueue" => PosixRlimitType::RlimitMsgqueue,
330 "nice" => PosixRlimitType::RlimitNice,
331 "rtprio" => PosixRlimitType::RlimitRtprio,
332 "rttime" => PosixRlimitType::RlimitRttime,
333 _ => return None,
334 })
335}
336
#[cfg(test)]
mod ulimit_translation_tests {
    use super::{ulimit_name_to_posix, PosixRlimitType};

    /// Recognised names resolve to their rlimit type regardless of ASCII case.
    #[test]
    fn known_names_map() {
        let cases = [
            ("nofile", PosixRlimitType::RlimitNofile),
            ("NOFILE", PosixRlimitType::RlimitNofile),
            ("nproc", PosixRlimitType::RlimitNproc),
            ("as", PosixRlimitType::RlimitAs),
        ];
        for (name, expected) in cases {
            assert_eq!(ulimit_name_to_posix(name), Some(expected));
        }
    }

    /// Unrecognised and empty names are rejected.
    #[test]
    fn unknown_names_return_none() {
        for name in ["not_a_real_ulimit", ""] {
            assert!(ulimit_name_to_posix(name).is_none());
        }
    }
}
364
365#[cfg(unix)]
370fn get_device_type(path: &str) -> std::io::Result<LinuxDeviceType> {
371 use std::os::unix::fs::FileTypeExt;
372 let metadata = std::fs::metadata(path)?;
373 let file_type = metadata.file_type();
374 if file_type.is_char_device() {
375 Ok(LinuxDeviceType::C)
376 } else if file_type.is_block_device() {
377 Ok(LinuxDeviceType::B)
378 } else {
379 Ok(LinuxDeviceType::U) }
381}
382
383#[cfg(not(unix))]
385fn get_device_type(_path: &str) -> std::io::Result<LinuxDeviceType> {
386 Err(std::io::Error::new(
387 std::io::ErrorKind::Unsupported,
388 "device-cgroup probes require Unix",
389 ))
390}
391
/// Builder that assembles an OCI bundle (a `rootfs` entry plus `config.json`)
/// for a single container.
///
/// Configure it with the `with_*` methods, then call `build` to write the
/// bundle to disk or `build_spec_only` to obtain just the OCI spec.
#[derive(Clone)]
pub struct BundleBuilder {
    /// Directory the bundle is written into (`rootfs`, `config.json`, `resolv.conf`).
    bundle_dir: PathBuf,
    /// When set, `build` symlinks `<bundle_dir>/rootfs` to this path; otherwise
    /// an empty `rootfs` directory is created.
    rootfs_path: Option<PathBuf>,
    /// Container hostname; the container id string is used when unset.
    hostname: Option<String>,
    /// Extra `(key, value)` environment variables; they replace same-named
    /// variables coming from the image or the service spec.
    extra_env: Vec<(String, String)>,
    /// Working directory override; takes precedence over the spec's workdir
    /// and the image's working directory.
    cwd: Option<String>,
    /// Process argument override; when unset the command is resolved from the
    /// service spec and image config.
    args: Option<Vec<String>>,
    /// Map handed to spec generation by `build` as `volume_paths`
    /// (presumably volume name -> host path; confirm against callers).
    volume_paths: HashMap<String, PathBuf>,
    /// Image configuration supplying the default user, environment and
    /// working directory.
    image_config: Option<zlayer_registry::ImageConfig>,
    /// Host-networking flag set via `with_host_network`.
    /// NOTE(review): the resolv.conf logic in `build_oci_spec` consults
    /// `spec.host_network`, not this field; confirm where this one is read.
    host_network: bool,
    /// Secrets backend used to resolve spec environment variables; only
    /// consulted when `deployment_scope` is also set.
    secrets_provider: Option<Arc<dyn SecretsProvider>>,
    /// Scope passed to secret resolution together with `secrets_provider`.
    deployment_scope: Option<String>,
    /// Host path bind-mounted read-only at the default ZLayer socket path
    /// inside the container.
    socket_path: Option<String>,
    /// Injected CDI registry. When set, CDI resolution failures are errors;
    /// when unset a registry is discovered and failures fall back to the
    /// built-in GPU environment variables.
    cdi_registry: Option<Arc<CdiRegistry>>,
}
439
440impl std::fmt::Debug for BundleBuilder {
441 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
442 f.debug_struct("BundleBuilder")
443 .field("bundle_dir", &self.bundle_dir)
444 .field("rootfs_path", &self.rootfs_path)
445 .field("hostname", &self.hostname)
446 .field("extra_env", &self.extra_env)
447 .field("cwd", &self.cwd)
448 .field("args", &self.args)
449 .field("volume_paths", &self.volume_paths)
450 .field("image_config", &self.image_config)
451 .field("host_network", &self.host_network)
452 .field("secrets_provider", &self.secrets_provider.is_some())
453 .field("deployment_scope", &self.deployment_scope)
454 .field("socket_path", &self.socket_path)
455 .field("cdi_registry", &self.cdi_registry.is_some())
456 .finish()
457 }
458}
459
460#[cfg(unix)]
468fn build_rootless_id_mappings(
469 host_id: u32,
470 subid_path: &str,
471 username: &str,
472) -> Vec<oci_spec::runtime::LinuxIdMapping> {
473 let mut mappings = vec![LinuxIdMappingBuilder::default()
474 .container_id(0_u32)
475 .host_id(host_id)
476 .size(1_u32)
477 .build()
478 .unwrap()];
479 if !username.is_empty() {
480 if let Some((start, count)) = read_subid_range(subid_path, username) {
481 mappings.push(
482 LinuxIdMappingBuilder::default()
483 .container_id(1_u32)
484 .host_id(start)
485 .size(count)
486 .build()
487 .unwrap(),
488 );
489 }
490 }
491 mappings
492}
493
/// Looks up the subordinate ID range for `username` in a
/// `name:start:count` file such as `/etc/subuid`.
///
/// Returns the `(start, count)` of the first well-formed entry whose name
/// field equals `username`. Lines that are malformed (missing fields or
/// non-numeric values) are skipped rather than aborting the search, and
/// whitespace around the line and the numeric fields is ignored.
///
/// Returns `None` when the file cannot be read or no usable entry exists.
#[cfg(unix)]
fn read_subid_range(path: &str, username: &str) -> Option<(u32, u32)> {
    let contents = std::fs::read_to_string(path).ok()?;

    contents.lines().find_map(|line| {
        let mut fields = line.trim().splitn(3, ':');
        if fields.next()? != username {
            return None;
        }
        // `?` here only rejects this line; the search continues with the next.
        let start = fields.next()?.trim().parse::<u32>().ok()?;
        let count = fields.next()?.trim().parse::<u32>().ok()?;
        Some((start, count))
    })
}
516
517impl BundleBuilder {
518 #[must_use]
528 pub fn new(bundle_dir: PathBuf) -> Self {
529 Self {
530 bundle_dir,
531 rootfs_path: None,
532 hostname: None,
533 extra_env: Vec::new(),
534 cwd: None,
535 args: None,
536 volume_paths: HashMap::new(),
537 image_config: None,
538 host_network: false,
539 secrets_provider: None,
540 deployment_scope: None,
541 socket_path: None,
542 cdi_registry: None,
543 }
544 }
545
546 #[must_use]
553 pub fn with_cdi_registry(mut self, registry: Arc<CdiRegistry>) -> Self {
554 self.cdi_registry = Some(registry);
555 self
556 }
557
558 #[must_use]
560 pub fn for_container(container_id: &ContainerId) -> Self {
561 let bundle_dir = zlayer_paths::ZLayerDirs::system_default()
562 .bundles()
563 .join(container_id.to_string());
564 Self::new(bundle_dir)
565 }
566
567 #[must_use]
571 pub fn with_rootfs(mut self, rootfs_path: PathBuf) -> Self {
572 self.rootfs_path = Some(rootfs_path);
573 self
574 }
575
576 #[must_use]
578 pub fn with_hostname(mut self, hostname: String) -> Self {
579 self.hostname = Some(hostname);
580 self
581 }
582
583 #[must_use]
585 pub fn with_env(mut self, key: String, value: String) -> Self {
586 self.extra_env.push((key, value));
587 self
588 }
589
590 #[must_use]
592 pub fn with_cwd(mut self, cwd: String) -> Self {
593 self.cwd = Some(cwd);
594 self
595 }
596
597 #[must_use]
599 pub fn with_args(mut self, args: Vec<String>) -> Self {
600 self.args = Some(args);
601 self
602 }
603
604 #[must_use]
609 pub fn with_volume_paths(mut self, volume_paths: HashMap<String, PathBuf>) -> Self {
610 self.volume_paths = volume_paths;
611 self
612 }
613
614 #[must_use]
619 pub fn with_image_config(mut self, config: zlayer_registry::ImageConfig) -> Self {
620 self.image_config = Some(config);
621 self
622 }
623
624 #[must_use]
630 pub fn with_host_network(mut self, host_network: bool) -> Self {
631 self.host_network = host_network;
632 self
633 }
634
635 #[must_use]
640 pub fn with_secrets_provider(mut self, provider: Arc<dyn SecretsProvider>) -> Self {
641 self.secrets_provider = Some(provider);
642 self
643 }
644
645 #[must_use]
650 pub fn with_deployment_scope(mut self, scope: String) -> Self {
651 self.deployment_scope = Some(scope);
652 self
653 }
654
655 #[must_use]
658 pub fn with_socket_mount(mut self, path: impl Into<String>) -> Self {
659 self.socket_path = Some(path.into());
660 self
661 }
662
663 #[must_use]
665 pub fn bundle_dir(&self) -> &Path {
666 &self.bundle_dir
667 }
668
669 #[cfg(unix)]
688 pub async fn build(&self, container_id: &ContainerId, spec: &ServiceSpec) -> Result<PathBuf> {
689 fs::create_dir_all(&self.bundle_dir)
691 .await
692 .map_err(|e| AgentError::CreateFailed {
693 id: container_id.to_string(),
694 reason: format!("failed to create bundle directory: {e}"),
695 })?;
696
697 let rootfs_in_bundle = self.bundle_dir.join("rootfs");
699 if let Some(ref rootfs_path) = self.rootfs_path {
700 let _ = fs::remove_file(&rootfs_in_bundle).await;
702 let _ = fs::remove_dir(&rootfs_in_bundle).await;
703
704 #[cfg(unix)]
709 tokio::fs::symlink(rootfs_path, &rootfs_in_bundle)
710 .await
711 .map_err(|e| AgentError::CreateFailed {
712 id: container_id.to_string(),
713 reason: format!(
714 "failed to symlink rootfs from {} to {}: {}",
715 rootfs_path.display(),
716 rootfs_in_bundle.display(),
717 e
718 ),
719 })?;
720
721 #[cfg(windows)]
722 tokio::fs::symlink_dir(rootfs_path, &rootfs_in_bundle)
723 .await
724 .map_err(|e| AgentError::CreateFailed {
725 id: container_id.to_string(),
726 reason: format!(
727 "failed to symlink rootfs from {} to {}: {}",
728 rootfs_path.display(),
729 rootfs_in_bundle.display(),
730 e
731 ),
732 })?;
733 } else {
734 fs::create_dir_all(&rootfs_in_bundle)
736 .await
737 .map_err(|e| AgentError::CreateFailed {
738 id: container_id.to_string(),
739 reason: format!("failed to create rootfs directory: {e}"),
740 })?;
741 }
742
743 let oci_spec = self
745 .build_spec_only(container_id, spec, &self.volume_paths)
746 .await?;
747
748 let config_path = self.bundle_dir.join("config.json");
750 let config_json =
751 serde_json::to_string_pretty(&oci_spec).map_err(|e| AgentError::CreateFailed {
752 id: container_id.to_string(),
753 reason: format!("failed to serialize OCI spec: {e}"),
754 })?;
755
756 fs::write(&config_path, config_json)
757 .await
758 .map_err(|e| AgentError::CreateFailed {
759 id: container_id.to_string(),
760 reason: format!("failed to write config.json: {e}"),
761 })?;
762
763 tracing::debug!(
764 "Created OCI bundle at {} for container {}",
765 self.bundle_dir.display(),
766 container_id
767 );
768
769 Ok(self.bundle_dir.clone())
770 }
771
    /// Generates the OCI runtime spec without creating the bundle directory,
    /// the `rootfs` entry or `config.json`.
    ///
    /// Unlike `build`, the volume path map is taken from the `volume_paths`
    /// argument rather than from the builder. This delegates directly to
    /// `build_oci_spec`.
    ///
    /// NOTE(review): spec generation writes `resolv.conf` into the bundle
    /// directory when the spec has DNS servers, host networking is off and
    /// the bundle directory already exists, so this call is not entirely
    /// free of filesystem side effects.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by spec generation.
    pub async fn build_spec_only(
        &self,
        container_id: &ContainerId,
        spec: &ServiceSpec,
        volume_paths: &std::collections::HashMap<String, PathBuf>,
    ) -> Result<oci_spec::runtime::Spec> {
        self.build_oci_spec(container_id, spec, volume_paths).await
    }
799
800 fn resolve_cdi_edits(&self, spec: &ServiceSpec) -> Result<Option<Vec<CdiContainerEdits>>> {
815 let Some(ref gpu) = spec.resources.gpu else {
816 return Ok(None);
817 };
818
819 let Some(kind) = cdi::vendor_to_cdi_kind(&gpu.vendor) else {
822 return Ok(None);
823 };
824
825 let (registry, strict) = if let Some(reg) = &self.cdi_registry {
831 (reg.clone(), true)
832 } else {
833 let reg = Arc::new(CdiRegistry::discover());
834 if reg.is_empty() {
835 return Ok(None);
836 }
837 (reg, false)
838 };
839
840 let device_names: Vec<String> = (0..gpu.count).map(|i| i.to_string()).collect();
841
842 match registry.resolve_for_kind(kind, &device_names) {
843 Ok(edits) => Ok(Some(edits)),
844 Err(err) => {
845 if strict {
846 Err(AgentError::InvalidSpec(format!(
847 "CDI resolution failed for vendor '{}': {err}",
848 gpu.vendor
849 )))
850 } else {
851 tracing::warn!(
852 vendor = %gpu.vendor,
853 kind = %kind,
854 error = %err,
855 "CDI resolution failed; falling back to baked-in GPU device passthrough"
856 );
857 Ok(None)
858 }
859 }
860 }
861 }
862
    /// Assembles the complete OCI runtime spec for one container.
    ///
    /// The spec is built in this order: CDI edits are resolved, then the
    /// process (user, environment, capabilities, working directory, args,
    /// rlimits), the root, the mounts, the Linux section and finally any
    /// CDI hooks.
    ///
    /// Environment variables are layered so that later sources replace
    /// earlier ones with the same key:
    /// 1. image config env;
    /// 2. default `PATH` and `TERM` when the image sets neither;
    /// 3. service spec env (resolved through the secrets provider when both
    ///    a provider and a deployment scope are configured);
    /// 4. the builder's extra env;
    /// 5. CDI edit env, or vendor-specific visible-device variables when no
    ///    CDI edits were resolved;
    /// 6. MPS directories, time-slice index and distributed-training variables.
    ///
    /// # Errors
    ///
    /// Returns [`AgentError::InvalidSpec`] when any part of the spec cannot
    /// be built or env resolution fails, and
    /// [`AgentError::GpuSharingUnavailable`] when a required GPU sharing
    /// resource is missing on the host.
    ///
    /// # Panics
    ///
    /// Panics if the socket bind mount cannot be built (`expect`).
    #[allow(clippy::too_many_lines)]
    async fn build_oci_spec(
        &self,
        container_id: &ContainerId,
        spec: &ServiceSpec,
        volume_paths: &std::collections::HashMap<String, PathBuf>,
    ) -> Result<Spec> {
        // Resolved first because the result drives env, mounts, the Linux
        // section and hooks below.
        let cdi_edits = self.resolve_cdi_edits(spec)?;

        let user = {
            // The image user is read as `uid[:gid]`. Parts that are not
            // numeric fall back to 0; a missing gid defaults to the uid.
            // NOTE(review): a named user such as "nobody" therefore ends up
            // as uid 0 — confirm this is intended.
            let (uid, gid) = if let Some(user_str) = self
                .image_config
                .as_ref()
                .and_then(|c| c.user.as_ref())
                .filter(|u| !u.is_empty())
            {
                let parts: Vec<&str> = user_str.splitn(2, ':').collect();
                let uid = parts[0].parse::<u32>().unwrap_or(0);
                let gid = if parts.len() > 1 {
                    parts[1].parse::<u32>().unwrap_or(0)
                } else {
                    uid
                };
                (uid, gid)
            } else {
                (0u32, 0u32)
            };

            UserBuilder::default()
                .uid(uid)
                .gid(gid)
                .build()
                .map_err(|e| AgentError::InvalidSpec(format!("failed to build user: {e}")))?
        };

        // `env` holds the final `KEY=value` entries in order; `env_keys`
        // tracks which keys are present so later layers can replace them.
        let mut env: Vec<String> = Vec::new();
        let mut env_keys: HashSet<String> = HashSet::new();

        // Layer 1: environment from the image config.
        if let Some(img_env) = self.image_config.as_ref().and_then(|c| c.env.as_ref()) {
            for entry in img_env {
                if let Some(key) = entry.split('=').next() {
                    env_keys.insert(key.to_string());
                }
                env.push(entry.clone());
            }
        }

        // Layer 2: defaults for PATH and TERM when the image did not set them.
        if !env_keys.contains("PATH") {
            env.push(
                "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin".to_string(),
            );
            env_keys.insert("PATH".to_string());
        }

        if !env_keys.contains("TERM") {
            env.push("TERM=xterm".to_string());
            env_keys.insert("TERM".to_string());
        }

        // Layer 3: service spec env. Secrets are only resolved when both a
        // provider and a deployment scope are configured.
        if let (Some(secrets_provider), Some(scope)) =
            (&self.secrets_provider, &self.deployment_scope)
        {
            let resolved_map =
                crate::env::resolve_env_with_secrets(&spec.env, secrets_provider.as_ref(), scope)
                    .await
                    .map_err(|e| {
                        AgentError::InvalidSpec(format!(
                            "environment variable resolution failed: {e}"
                        ))
                    })?;

            for (key, value) in &resolved_map {
                // Drop any earlier entry for this key before re-adding it.
                if env_keys.contains(key.as_str()) {
                    env.retain(|e| e.split('=').next() != Some(key.as_str()));
                }
                env_keys.insert(key.clone());
                env.push(format!("{key}={value}"));
            }
        } else {
            let resolved = crate::env::resolve_env_vars_with_warnings(&spec.env).map_err(|e| {
                AgentError::InvalidSpec(format!("environment variable resolution failed: {e}"))
            })?;

            for warning in &resolved.warnings {
                tracing::warn!(container = %container_id, "{}", warning);
            }

            for var in &resolved.vars {
                if let Some(key) = var.split('=').next() {
                    if env_keys.contains(key) {
                        env.retain(|e| e.split('=').next() != Some(key));
                    }
                    env_keys.insert(key.to_string());
                }
                env.push(var.clone());
            }
        }

        // Layer 4: builder-supplied extra env replaces everything above.
        for (key, value) in &self.extra_env {
            if env_keys.contains(key.as_str()) {
                env.retain(|e| e.split('=').next() != Some(key.as_str()));
            }
            env_keys.insert(key.clone());
            env.push(format!("{key}={value}"));
        }

        // Layer 5: GPU visibility. CDI edits take precedence; without them
        // the vendor-specific "visible devices" variables are set to the
        // index list `0,1,...,count-1`.
        if let Some(ref edits_per_device) = cdi_edits {
            for edits in edits_per_device {
                for entry in &edits.env {
                    if let Some(key) = entry.split('=').next() {
                        if env_keys.contains(key) {
                            env.retain(|e| e.split('=').next() != Some(key));
                        }
                        env_keys.insert(key.to_string());
                    }
                    env.push(entry.clone());
                }
            }
        } else if let Some(ref gpu) = spec.resources.gpu {
            let indices: Vec<String> = (0..gpu.count).map(|i| i.to_string()).collect();
            let device_list = indices.join(",");
            // NOTE(review): these pushes neither replace existing entries
            // nor record the keys in `env_keys`, unlike the layers above.
            match gpu.vendor.as_str() {
                "nvidia" => {
                    env.push(format!("NVIDIA_VISIBLE_DEVICES={device_list}"));
                    env.push(format!("CUDA_VISIBLE_DEVICES={device_list}"));
                }
                "amd" => {
                    env.push(format!("ROCR_VISIBLE_DEVICES={device_list}"));
                    env.push(format!("HIP_VISIBLE_DEVICES={device_list}"));
                }
                "intel" => {
                    env.push(format!("ZE_AFFINITY_MASK={device_list}"));
                }
                _ => {}
            }
        }

        // MPS sharing: fails here if the host directories are missing; the
        // resolved directories are reused for the bind mounts further down.
        let mps_dirs = if let Some(ref gpu) = spec.resources.gpu {
            resolve_mps_dirs(gpu)?
        } else {
            None
        };
        if let Some(ref dirs) = mps_dirs {
            let pipe = format!("CUDA_MPS_PIPE_DIRECTORY={}", dirs.pipe_dir.display());
            let log = format!("CUDA_MPS_LOG_DIRECTORY={}", dirs.log_dir.display());
            if env_keys.contains("CUDA_MPS_PIPE_DIRECTORY") {
                env.retain(|e| e.split('=').next() != Some("CUDA_MPS_PIPE_DIRECTORY"));
            }
            if env_keys.contains("CUDA_MPS_LOG_DIRECTORY") {
                env.retain(|e| e.split('=').next() != Some("CUDA_MPS_LOG_DIRECTORY"));
            }
            env_keys.insert("CUDA_MPS_PIPE_DIRECTORY".to_string());
            env_keys.insert("CUDA_MPS_LOG_DIRECTORY".to_string());
            env.push(pipe);
            env.push(log);
        }
        // Time-slice sharing: pin CUDA to the assigned index. The retain is
        // unconditional, so it also removes the vendor fallback entry above.
        if let Some(ref gpu) = spec.resources.gpu {
            if gpu.sharing == Some(GpuSharingMode::TimeSlice) {
                if let Some(idx) = gpu.time_slice_index {
                    env.retain(|e| e.split('=').next() != Some("CUDA_VISIBLE_DEVICES"));
                    env_keys.insert("CUDA_VISIBLE_DEVICES".to_string());
                    env.push(format!("CUDA_VISIBLE_DEVICES={idx}"));
                }
            }
        }

        // Distributed training: fixed single-process values, with the
        // service name as the master address.
        // NOTE(review): these are appended without replacing same-named
        // entries from earlier layers.
        if let Some(ref gpu) = spec.resources.gpu {
            if let Some(ref dist) = gpu.distributed {
                env.push(format!("MASTER_PORT={}", dist.master_port));
                env.push(format!("MASTER_ADDR={}", container_id.service));
                env.push("WORLD_SIZE=1".to_string());
                env.push("RANK=0".to_string());
                env.push("LOCAL_RANK=0".to_string());
                match dist.backend.as_str() {
                    "nccl" => env.push("NCCL_SOCKET_IFNAME=eth0".to_string()),
                    "gloo" => env.push("GLOO_SOCKET_IFNAME=eth0".to_string()),
                    _ => {}
                }
            }
        }

        let capabilities = self.build_capabilities(spec)?;

        // Working directory precedence: builder override, then the spec's
        // workdir, then a non-empty image working dir, then "/".
        let cwd = self
            .cwd
            .clone()
            .or_else(|| spec.command.workdir.clone())
            .or_else(|| {
                self.image_config
                    .as_ref()
                    .and_then(|c| c.working_dir.as_ref())
                    .filter(|w| !w.is_empty())
                    .cloned()
            })
            .unwrap_or_else(|| "/".to_string());

        // Builder-supplied args win over the command resolved from the spec
        // and image config.
        let process_args = if let Some(ref args) = self.args {
            args.clone()
        } else {
            Self::resolve_command_from_spec(spec, self.image_config.as_ref())
        };

        // `no_new_privileges` is only enabled for containers that are
        // neither privileged nor given explicit capabilities.
        let mut process_builder = ProcessBuilder::default()
            .terminal(false)
            .user(user)
            .env(env)
            .args(process_args)
            .cwd(cwd)
            .no_new_privileges(!spec.privileged && spec.capabilities.is_empty());

        if let Some(caps) = capabilities {
            process_builder = process_builder.capabilities(caps);
        }

        // Translate ulimits; unknown names are a spec error and negative
        // limits are clamped to 0.
        let mut rlimits: Vec<PosixRlimit> = Vec::with_capacity(spec.ulimits.len());
        for (name, limit) in &spec.ulimits {
            let typ = ulimit_name_to_posix(name).ok_or_else(|| {
                AgentError::InvalidSpec(format!(
                    "unknown ulimit name `{name}` (expected one of: cpu, fsize, data, stack, \
                     core, rss, nproc, nofile, memlock, as, locks, sigpending, msgqueue, nice, \
                     rtprio, rttime)"
                ))
            })?;
            let entry = PosixRlimitBuilder::default()
                .typ(typ)
                .soft(u64::try_from(limit.soft.max(0)).unwrap_or(0))
                .hard(u64::try_from(limit.hard.max(0)).unwrap_or(0))
                .build()
                .map_err(|e| {
                    AgentError::InvalidSpec(format!("failed to build rlimit `{name}`: {e}"))
                })?;
            rlimits.push(entry);
        }
        if !rlimits.is_empty() {
            process_builder = process_builder.rlimits(rlimits);
        }

        let process = process_builder
            .build()
            .map_err(|e| AgentError::InvalidSpec(format!("failed to build process: {e}")))?;

        // The root path is relative to the bundle directory and writable.
        let root = RootBuilder::default()
            .path("rootfs".to_string())
            .readonly(false)
            .build()
            .map_err(|e| AgentError::InvalidSpec(format!("failed to build root: {e}")))?;

        // Mount order: defaults, storage, socket, resolv.conf, CDI, MPS,
        // time-slicing config.
        let mut mounts = self.build_default_mounts(spec)?;

        let storage_mounts = self.build_storage_mounts(spec, volume_paths)?;
        mounts.extend(storage_mounts);

        if let Some(ref socket_path) = self.socket_path {
            // NOTE(review): this is the only mount built with `expect`;
            // every other mount maps the builder error to `InvalidSpec`.
            mounts.push(
                MountBuilder::default()
                    .destination(zlayer_paths::ZLayerDirs::default_socket_path())
                    .typ("bind")
                    .source(socket_path.clone())
                    .options(vec!["rbind".into(), "ro".into()])
                    .build()
                    .expect("valid socket mount"),
            );
        }

        // Custom DNS: write resolv.conf into the bundle and bind-mount it
        // read-only. Skipped when the bundle directory does not exist yet.
        // NOTE(review): this reads `spec.host_network`, not the builder's
        // own `host_network` field.
        if !spec.host_network && !spec.dns.is_empty() && self.bundle_dir.exists() {
            let resolv_path = self.bundle_dir.join("resolv.conf");
            let contents = generate_resolv_conf(&spec.dns);
            fs::write(&resolv_path, contents).await.map_err(|e| {
                AgentError::InvalidSpec(format!(
                    "failed to write resolv.conf to bundle at {}: {e}",
                    resolv_path.display()
                ))
            })?;
            mounts.push(
                MountBuilder::default()
                    .destination("/etc/resolv.conf".to_string())
                    .typ("bind")
                    .source(resolv_path.to_string_lossy().to_string())
                    .options(vec!["rbind".to_string(), "ro".to_string()])
                    .build()
                    .map_err(|e| {
                        AgentError::InvalidSpec(format!("failed to build resolv.conf mount: {e}"))
                    })?,
            );
        }

        // CDI mounts are always bind mounts; `rbind` is added when the CDI
        // options name neither `bind` nor `rbind`.
        if let Some(ref edits_per_device) = cdi_edits {
            for edits in edits_per_device {
                for cdi_mount in &edits.mounts {
                    let mut opts = cdi_mount.options.clone();
                    if !opts.iter().any(|o| o == "bind" || o == "rbind") {
                        opts.push("rbind".to_string());
                    }
                    mounts.push(
                        MountBuilder::default()
                            .destination(cdi_mount.container_path.clone())
                            .typ("bind")
                            .source(cdi_mount.host_path.clone())
                            .options(opts)
                            .build()
                            .map_err(|e| {
                                AgentError::InvalidSpec(format!("failed to build CDI mount: {e}"))
                            })?,
                    );
                }
            }
        }

        // MPS directories are mounted read-write at the same path inside the
        // container as on the host, matching the env variables set above.
        if let Some(ref dirs) = mps_dirs {
            mounts.push(
                MountBuilder::default()
                    .destination(dirs.pipe_dir.clone())
                    .typ("bind")
                    .source(dirs.pipe_dir.clone())
                    .options(vec!["rbind".into(), "rw".into()])
                    .build()
                    .map_err(|e| {
                        AgentError::InvalidSpec(format!("failed to build MPS pipe mount: {e}"))
                    })?,
            );
            mounts.push(
                MountBuilder::default()
                    .destination(dirs.log_dir.clone())
                    .typ("bind")
                    .source(dirs.log_dir.clone())
                    .options(vec!["rbind".into(), "rw".into()])
                    .build()
                    .map_err(|e| {
                        AgentError::InvalidSpec(format!("failed to build MPS log mount: {e}"))
                    })?,
            );
        }
        // Time-slicing config: must be a regular file on the host; mounted
        // read-only at the fixed container path.
        if let Some(ref gpu) = spec.resources.gpu {
            if gpu.sharing == Some(GpuSharingMode::TimeSlice) {
                if let Some(ref cfg_path) = gpu.time_slicing_config_path {
                    let host = PathBuf::from(cfg_path);
                    if !host.is_file() {
                        return Err(AgentError::GpuSharingUnavailable {
                            mode: "time-slice".to_string(),
                            reason: format!(
                                "time-slicing config {} is not a regular file on the host",
                                host.display()
                            ),
                        });
                    }
                    mounts.push(
                        MountBuilder::default()
                            .destination(PathBuf::from(TIMESLICE_CONFIG_CONTAINER_PATH))
                            .typ("bind")
                            .source(host)
                            .options(vec!["rbind".into(), "ro".into()])
                            .build()
                            .map_err(|e| {
                                AgentError::InvalidSpec(format!(
                                    "failed to build time-slicing config mount: {e}"
                                ))
                            })?,
                    );
                }
            }
        }

        let linux = self.build_linux_config(container_id, spec, cdi_edits.as_deref())?;

        // The hostname defaults to the container id string.
        let hostname = self
            .hostname
            .clone()
            .unwrap_or_else(|| container_id.to_string());

        let mut spec_builder = SpecBuilder::default()
            .version("1.0.2".to_string())
            .root(root)
            .process(process)
            .hostname(hostname)
            .mounts(mounts)
            .linux(linux);

        // Hooks are only attached when the CDI edits define at least one.
        if let Some(ref edits_per_device) = cdi_edits {
            if let Some(hooks) = Self::build_hooks_from_cdi(edits_per_device)? {
                spec_builder = spec_builder.hooks(hooks);
            }
        }

        let oci_spec = spec_builder
            .build()
            .map_err(|e| AgentError::InvalidSpec(format!("failed to build OCI spec: {e}")))?;

        Ok(oci_spec)
    }
1371
1372 fn build_hooks_from_cdi(edits_per_device: &[CdiContainerEdits]) -> Result<Option<Hooks>> {
1379 let mut prestart: Vec<Hook> = Vec::new();
1380 let mut create_runtime: Vec<Hook> = Vec::new();
1381 let mut create_container: Vec<Hook> = Vec::new();
1382 let mut start_container: Vec<Hook> = Vec::new();
1383 let mut poststart: Vec<Hook> = Vec::new();
1384 let mut poststop: Vec<Hook> = Vec::new();
1385
1386 for edits in edits_per_device {
1387 let Some(ref h) = edits.hooks else { continue };
1388 for hook in &h.prestart {
1389 prestart.push(convert_cdi_hook(hook)?);
1390 }
1391 for hook in &h.create_runtime {
1392 create_runtime.push(convert_cdi_hook(hook)?);
1393 }
1394 for hook in &h.create_container {
1395 create_container.push(convert_cdi_hook(hook)?);
1396 }
1397 for hook in &h.start_container {
1398 start_container.push(convert_cdi_hook(hook)?);
1399 }
1400 for hook in &h.poststart {
1401 poststart.push(convert_cdi_hook(hook)?);
1402 }
1403 for hook in &h.poststop {
1404 poststop.push(convert_cdi_hook(hook)?);
1405 }
1406 }
1407
1408 if prestart.is_empty()
1409 && create_runtime.is_empty()
1410 && create_container.is_empty()
1411 && start_container.is_empty()
1412 && poststart.is_empty()
1413 && poststop.is_empty()
1414 {
1415 return Ok(None);
1416 }
1417
1418 let mut builder = HooksBuilder::default();
1419 if !prestart.is_empty() {
1420 #[allow(deprecated)]
1421 {
1422 builder = builder.prestart(prestart);
1423 }
1424 }
1425 if !create_runtime.is_empty() {
1426 builder = builder.create_runtime(create_runtime);
1427 }
1428 if !create_container.is_empty() {
1429 builder = builder.create_container(create_container);
1430 }
1431 if !start_container.is_empty() {
1432 builder = builder.start_container(start_container);
1433 }
1434 if !poststart.is_empty() {
1435 builder = builder.poststart(poststart);
1436 }
1437 if !poststop.is_empty() {
1438 builder = builder.poststop(poststop);
1439 }
1440
1441 let hooks = builder
1442 .build()
1443 .map_err(|e| AgentError::InvalidSpec(format!("failed to build CDI hooks: {e}")))?;
1444 Ok(Some(hooks))
1445 }
1446
1447 #[allow(clippy::unused_self)]
1449 fn build_capabilities(
1450 &self,
1451 spec: &ServiceSpec,
1452 ) -> Result<Option<oci_spec::runtime::LinuxCapabilities>> {
1453 if spec.privileged {
1454 let all_caps: HashSet<Capability> = ALL_CAPABILITIES.iter().copied().collect();
1456 let empty_caps: HashSet<Capability> = HashSet::new();
1457
1458 let caps = LinuxCapabilitiesBuilder::default()
1459 .bounding(all_caps.clone())
1460 .effective(all_caps.clone())
1461 .permitted(all_caps)
1462 .inheritable(empty_caps.clone())
1463 .ambient(empty_caps)
1464 .build()
1465 .map_err(|e| {
1466 AgentError::InvalidSpec(format!("failed to build capabilities: {e}"))
1467 })?;
1468
1469 Ok(Some(caps))
1470 } else if !spec.capabilities.is_empty() {
1471 let caps: HashSet<Capability> = spec
1473 .capabilities
1474 .iter()
1475 .filter_map(|c| {
1476 let cap_name = if c.starts_with("CAP_") {
1478 c.to_uppercase()
1479 } else {
1480 format!("CAP_{}", c.to_uppercase())
1481 };
1482 Capability::from_str(&cap_name).ok()
1483 })
1484 .collect();
1485
1486 let empty_caps: HashSet<Capability> = HashSet::new();
1487
1488 let built_caps = LinuxCapabilitiesBuilder::default()
1489 .bounding(caps.clone())
1490 .effective(caps.clone())
1491 .permitted(caps)
1492 .inheritable(empty_caps.clone())
1493 .ambient(empty_caps)
1494 .build()
1495 .map_err(|e| {
1496 AgentError::InvalidSpec(format!("failed to build capabilities: {e}"))
1497 })?;
1498
1499 Ok(Some(built_caps))
1500 } else {
1501 let default_caps: HashSet<Capability> = [
1503 Capability::Chown,
1504 Capability::DacOverride,
1505 Capability::Fsetid,
1506 Capability::Fowner,
1507 Capability::Mknod,
1508 Capability::NetRaw,
1509 Capability::Setgid,
1510 Capability::Setuid,
1511 Capability::Setfcap,
1512 Capability::Setpcap,
1513 Capability::NetBindService,
1514 Capability::SysChroot,
1515 Capability::Kill,
1516 Capability::AuditWrite,
1517 ]
1518 .into_iter()
1519 .collect();
1520
1521 let empty_caps: HashSet<Capability> = HashSet::new();
1522
1523 let built_caps = LinuxCapabilitiesBuilder::default()
1524 .bounding(default_caps.clone())
1525 .effective(default_caps.clone())
1526 .permitted(default_caps)
1527 .inheritable(empty_caps.clone())
1528 .ambient(empty_caps)
1529 .build()
1530 .map_err(|e| {
1531 AgentError::InvalidSpec(format!("failed to build capabilities: {e}"))
1532 })?;
1533
1534 Ok(Some(built_caps))
1535 }
1536 }
1537
1538 #[allow(clippy::unused_self, clippy::too_many_lines)]
1540 fn build_default_mounts(&self, spec: &ServiceSpec) -> Result<Vec<Mount>> {
1541 let mut mounts = Vec::new();
1542
1543 mounts.push(
1545 MountBuilder::default()
1546 .destination("/proc".to_string())
1547 .typ("proc".to_string())
1548 .source("proc".to_string())
1549 .options(vec![
1550 "nosuid".to_string(),
1551 "noexec".to_string(),
1552 "nodev".to_string(),
1553 ])
1554 .build()
1555 .map_err(|e| {
1556 AgentError::InvalidSpec(format!("failed to build /proc mount: {e}"))
1557 })?,
1558 );
1559
1560 mounts.push(
1562 MountBuilder::default()
1563 .destination("/dev".to_string())
1564 .typ("tmpfs".to_string())
1565 .source("tmpfs".to_string())
1566 .options(vec![
1567 "nosuid".to_string(),
1568 "strictatime".to_string(),
1569 "mode=755".to_string(),
1570 "size=65536k".to_string(),
1571 ])
1572 .build()
1573 .map_err(|e| AgentError::InvalidSpec(format!("failed to build /dev mount: {e}")))?,
1574 );
1575
1576 mounts.push(
1578 MountBuilder::default()
1579 .destination("/dev/pts".to_string())
1580 .typ("devpts".to_string())
1581 .source("devpts".to_string())
1582 .options(vec![
1583 "nosuid".to_string(),
1584 "noexec".to_string(),
1585 "newinstance".to_string(),
1586 "ptmxmode=0666".to_string(),
1587 "mode=0620".to_string(),
1588 "gid=5".to_string(),
1589 ])
1590 .build()
1591 .map_err(|e| {
1592 AgentError::InvalidSpec(format!("failed to build /dev/pts mount: {e}"))
1593 })?,
1594 );
1595
1596 mounts.push(
1598 MountBuilder::default()
1599 .destination("/dev/shm".to_string())
1600 .typ("tmpfs".to_string())
1601 .source("shm".to_string())
1602 .options(vec![
1603 "nosuid".to_string(),
1604 "noexec".to_string(),
1605 "nodev".to_string(),
1606 "mode=1777".to_string(),
1607 "size=65536k".to_string(),
1608 ])
1609 .build()
1610 .map_err(|e| {
1611 AgentError::InvalidSpec(format!("failed to build /dev/shm mount: {e}"))
1612 })?,
1613 );
1614
1615 mounts.push(
1617 MountBuilder::default()
1618 .destination("/dev/mqueue".to_string())
1619 .typ("mqueue".to_string())
1620 .source("mqueue".to_string())
1621 .options(vec![
1622 "nosuid".to_string(),
1623 "noexec".to_string(),
1624 "nodev".to_string(),
1625 ])
1626 .build()
1627 .map_err(|e| {
1628 AgentError::InvalidSpec(format!("failed to build /dev/mqueue mount: {e}"))
1629 })?,
1630 );
1631
1632 let sys_options = if spec.privileged {
1634 vec![
1635 "nosuid".to_string(),
1636 "noexec".to_string(),
1637 "nodev".to_string(),
1638 ]
1639 } else {
1640 vec![
1641 "nosuid".to_string(),
1642 "noexec".to_string(),
1643 "nodev".to_string(),
1644 "ro".to_string(),
1645 ]
1646 };
1647
1648 mounts.push(
1649 MountBuilder::default()
1650 .destination("/sys".to_string())
1651 .typ("sysfs".to_string())
1652 .source("sysfs".to_string())
1653 .options(sys_options)
1654 .build()
1655 .map_err(|e| AgentError::InvalidSpec(format!("failed to build /sys mount: {e}")))?,
1656 );
1657
1658 mounts.push(
1660 MountBuilder::default()
1661 .destination("/sys/fs/cgroup".to_string())
1662 .typ("cgroup2".to_string())
1663 .source("cgroup".to_string())
1664 .options(vec![
1665 "nosuid".to_string(),
1666 "noexec".to_string(),
1667 "nodev".to_string(),
1668 "relatime".to_string(),
1669 ])
1670 .build()
1671 .map_err(|e| {
1672 AgentError::InvalidSpec(format!("failed to build cgroup mount: {e}"))
1673 })?,
1674 );
1675
1676 Ok(mounts)
1677 }
1678
1679 #[allow(clippy::unused_self, clippy::too_many_lines)]
1685 fn build_storage_mounts(
1686 &self,
1687 spec: &ServiceSpec,
1688 volume_paths: &std::collections::HashMap<String, PathBuf>,
1689 ) -> Result<Vec<Mount>> {
1690 let mut mounts = Vec::new();
1691
1692 for storage in &spec.storage {
1693 let mount = match storage {
1694 StorageSpec::Bind {
1695 source,
1696 target,
1697 readonly,
1698 } => {
1699 let mut options = vec!["rbind".to_string()];
1700 if *readonly {
1701 options.push("ro".to_string());
1702 } else {
1703 options.push("rw".to_string());
1704 }
1705
1706 MountBuilder::default()
1707 .destination(target.clone())
1708 .typ("none".to_string())
1709 .source(source.clone())
1710 .options(options)
1711 .build()
1712 .map_err(|e| {
1713 AgentError::InvalidSpec(format!(
1714 "failed to build bind mount for {target}: {e}"
1715 ))
1716 })?
1717 }
1718
1719 StorageSpec::Named {
1720 name,
1721 target,
1722 readonly,
1723 tier,
1724 ..
1725 } => {
1726 let source = volume_paths.get(name).ok_or_else(|| {
1728 AgentError::InvalidSpec(format!(
1729 "volume '{name}' not prepared - ensure StorageManager.ensure_volume() was called"
1730 ))
1731 })?;
1732
1733 if matches!(tier, StorageTier::Network) {
1735 tracing::warn!(
1736 volume = %name,
1737 tier = ?tier,
1738 "Network storage tier is NOT SQLite-safe. Avoid using SQLite databases on this volume."
1739 );
1740 }
1741
1742 let mut options = vec!["rbind".to_string()];
1743 if *readonly {
1744 options.push("ro".to_string());
1745 } else {
1746 options.push("rw".to_string());
1747 }
1748
1749 MountBuilder::default()
1750 .destination(target.clone())
1751 .typ("none".to_string())
1752 .source(source.to_string_lossy().to_string())
1753 .options(options)
1754 .build()
1755 .map_err(|e| {
1756 AgentError::InvalidSpec(format!(
1757 "failed to build named volume mount for {target}: {e}"
1758 ))
1759 })?
1760 }
1761
1762 StorageSpec::Anonymous { target, tier } => {
1763 let key = format!("_anon_{}", target.trim_start_matches('/').replace('/', "_"));
1766 let source = volume_paths.get(&key).ok_or_else(|| {
1767 AgentError::InvalidSpec(format!(
1768 "anonymous volume for '{target}' not prepared"
1769 ))
1770 })?;
1771
1772 if matches!(tier, StorageTier::Network) {
1773 tracing::warn!(
1774 target = %target,
1775 tier = ?tier,
1776 "Network storage tier is NOT SQLite-safe."
1777 );
1778 }
1779
1780 let options = vec!["rbind".to_string(), "rw".to_string()];
1781
1782 MountBuilder::default()
1783 .destination(target.clone())
1784 .typ("none".to_string())
1785 .source(source.to_string_lossy().to_string())
1786 .options(options)
1787 .build()
1788 .map_err(|e| {
1789 AgentError::InvalidSpec(format!(
1790 "failed to build anonymous volume mount for {target}: {e}"
1791 ))
1792 })?
1793 }
1794
1795 StorageSpec::Tmpfs { target, size, mode } => {
1796 let mut options = vec!["nosuid".to_string(), "nodev".to_string()];
1797
1798 if let Some(size_str) = size {
1799 options.push(format!("size={size_str}"));
1800 }
1801
1802 if let Some(mode_val) = mode {
1803 options.push(format!("mode={mode_val:o}"));
1804 }
1805
1806 MountBuilder::default()
1807 .destination(target.clone())
1808 .typ("tmpfs".to_string())
1809 .source("tmpfs".to_string())
1810 .options(options)
1811 .build()
1812 .map_err(|e| {
1813 AgentError::InvalidSpec(format!(
1814 "failed to build tmpfs mount for {target}: {e}"
1815 ))
1816 })?
1817 }
1818
1819 StorageSpec::S3 {
1820 bucket,
1821 prefix,
1822 target,
1823 readonly,
1824 endpoint: _,
1825 credentials: _,
1826 } => {
1827 let key = format!("_s3_{}_{}", bucket, prefix.as_deref().unwrap_or(""));
1830 let source = volume_paths.get(&key).ok_or_else(|| {
1831 AgentError::InvalidSpec(format!(
1832 "S3 volume for bucket '{bucket}' not mounted - ensure StorageManager.mount_s3() was called"
1833 ))
1834 })?;
1835
1836 tracing::warn!(
1837 bucket = %bucket,
1838 target = %target,
1839 "S3 storage is NOT SQLite-safe. Use for read-heavy workloads only."
1840 );
1841
1842 let mut options = vec!["rbind".to_string()];
1843 if *readonly {
1844 options.push("ro".to_string());
1845 } else {
1846 options.push("rw".to_string());
1847 }
1848
1849 MountBuilder::default()
1850 .destination(target.clone())
1851 .typ("none".to_string())
1852 .source(source.to_string_lossy().to_string())
1853 .options(options)
1854 .build()
1855 .map_err(|e| {
1856 AgentError::InvalidSpec(format!(
1857 "failed to build S3 mount for {target}: {e}"
1858 ))
1859 })?
1860 }
1861 };
1862
1863 mounts.push(mount);
1864 }
1865
1866 Ok(mounts)
1867 }
1868
    /// Assembles the `linux` section of the OCI runtime spec for one container.
    ///
    /// The section is built in this order:
    /// 1. Namespaces: pid, ipc, uts and mount always; network unless
    ///    `self.host_network` is set; user and cgroup when running rootless.
    /// 2. Rootless uid/gid mappings (unix only), derived from `/etc/subuid` and
    ///    `/etc/subgid` via `build_rootless_id_mappings`.
    /// 3. Resource limits from `build_resources`.
    /// 4. Device nodes from `build_devices`, plus the device nodes of every CDI edit
    ///    in `cdi_edits`.
    /// 5. Rootfs propagation (`private`) and masked / read-only paths (both lists are
    ///    empty for privileged containers).
    /// 6. The cgroup path, `<parent>/<container id>`, where the parent is resolved
    ///    from the spec, then the `ZLAYER_CGROUP_PARENT` environment variable, then
    ///    (Linux only) the `crate::capability` helpers.
    ///
    /// # Errors
    ///
    /// Returns [`AgentError::InvalidSpec`] when a sub-builder fails, or — on Linux —
    /// when no cgroup parent could be resolved and the daemon capabilities report
    /// that the cgroup root is not writable.
    ///
    /// # Panics
    ///
    /// Panics if a `LinuxNamespaceBuilder` fails to build (the `unwrap()` calls below).
    #[allow(clippy::similar_names)]
    #[allow(clippy::too_many_lines)]
    fn build_linux_config(
        &self,
        container_id: &ContainerId,
        spec: &ServiceSpec,
        cdi_edits: Option<&[CdiContainerEdits]>,
    ) -> Result<oci_spec::runtime::Linux> {
        // Namespaces every container gets.
        let mut namespaces = vec![
            LinuxNamespaceBuilder::default()
                .typ(LinuxNamespaceType::Pid)
                .build()
                .unwrap(),
            LinuxNamespaceBuilder::default()
                .typ(LinuxNamespaceType::Ipc)
                .build()
                .unwrap(),
            LinuxNamespaceBuilder::default()
                .typ(LinuxNamespaceType::Uts)
                .build()
                .unwrap(),
            LinuxNamespaceBuilder::default()
                .typ(LinuxNamespaceType::Mount)
                .build()
                .unwrap(),
        ];

        // A network namespace is only created when the container does not share the
        // host network.
        if !self.host_network {
            namespaces.push(
                LinuxNamespaceBuilder::default()
                    .typ(LinuxNamespaceType::Network)
                    .build()
                    .unwrap(),
            );
        }

        // "Rootless" means the effective uid of this process is not root. On
        // non-unix targets the check is compiled out and rootless is always false.
        #[cfg(unix)]
        let rootless = !nix::unistd::geteuid().is_root();
        #[cfg(not(unix))]
        let rootless = false;

        // Rootless containers additionally get their own user and cgroup namespaces.
        if rootless {
            namespaces.push(
                LinuxNamespaceBuilder::default()
                    .typ(LinuxNamespaceType::User)
                    .build()
                    .unwrap(),
            );
            namespaces.push(
                LinuxNamespaceBuilder::default()
                    .typ(LinuxNamespaceType::Cgroup)
                    .build()
                    .unwrap(),
            );
        }

        let mut linux_builder = LinuxBuilder::default().namespaces(namespaces);

        // Rootless: map ids using the effective uid/gid and the current user's name.
        // If the user name cannot be looked up, an empty name is passed on.
        #[cfg(unix)]
        if rootless {
            let euid = nix::unistd::geteuid();
            let egid = nix::unistd::getegid();
            let username = nix::unistd::User::from_uid(euid)
                .ok()
                .flatten()
                .map(|u| u.name)
                .unwrap_or_default();
            linux_builder = linux_builder
                .uid_mappings(build_rootless_id_mappings(
                    euid.as_raw(),
                    "/etc/subuid",
                    &username,
                ))
                .gid_mappings(build_rootless_id_mappings(
                    egid.as_raw(),
                    "/etc/subgid",
                    &username,
                ));
        }

        // Resource limits (CPU, memory, device cgroup rules).
        let resources = self.build_resources(spec)?;
        if let Some(resources) = resources {
            linux_builder = linux_builder.resources(resources);
        }

        // Device nodes. When CDI edits are supplied, `build_devices` is told to skip
        // its built-in GPU device nodes and the CDI-declared nodes are appended instead.
        let mut devices = self.build_devices(spec, None, cdi_edits.is_some())?;
        if let Some(edits_per_device) = cdi_edits {
            for edits in edits_per_device {
                for node in &edits.device_nodes {
                    devices.push(cdi_node_to_oci_device(node)?);
                }
            }
        }
        if !devices.is_empty() {
            linux_builder = linux_builder.devices(devices);
        }

        linux_builder = linux_builder.rootfs_propagation("private".to_string());

        if spec.privileged {
            // Privileged containers: nothing is masked or forced read-only.
            linux_builder = linux_builder.masked_paths(vec![]).readonly_paths(vec![]);
        } else {
            // Paths hidden from unprivileged containers.
            let masked_paths = vec![
                "/proc/acpi".to_string(),
                "/proc/asound".to_string(),
                "/proc/kcore".to_string(),
                "/proc/keys".to_string(),
                "/proc/latency_stats".to_string(),
                "/proc/timer_list".to_string(),
                "/proc/timer_stats".to_string(),
                "/proc/sched_debug".to_string(),
                "/proc/scsi".to_string(),
                "/sys/firmware".to_string(),
            ];

            // Paths made read-only for unprivileged containers.
            let readonly_paths = vec![
                "/proc/bus".to_string(),
                "/proc/fs".to_string(),
                "/proc/irq".to_string(),
                "/proc/sys".to_string(),
                "/proc/sysrq-trigger".to_string(),
            ];

            linux_builder = linux_builder
                .masked_paths(masked_paths)
                .readonly_paths(readonly_paths);
        }

        let cid = container_id.to_string();

        // Explicit cgroup parent: a non-empty `spec.cgroup_parent` wins, then a
        // non-empty ZLAYER_CGROUP_PARENT environment variable. The second tuple
        // element records where the value came from, for the log lines below.
        let explicit_parent: Option<(String, &'static str)> =
            if let Some(p) = spec.cgroup_parent.as_deref().filter(|s| !s.is_empty()) {
                Some((p.to_string(), "spec"))
            } else if let Some(p) = std::env::var("ZLAYER_CGROUP_PARENT")
                .ok()
                .filter(|s| !s.is_empty())
            {
                Some((p, "env"))
            } else {
                None
            };

        // Automatic cgroup parent (Linux only).
        // NOTE(review): this is evaluated even when an explicit parent was found
        // above, so `ensure_daemon_leaf_and_container_parent()` always runs on
        // Linux; the "auto-init" log message below suggests it migrates the
        // daemon — confirm that this is intended when an explicit parent is set.
        #[cfg(target_os = "linux")]
        let auto_parent: Option<(String, &'static str)> =
            if let Some(p) = crate::capability::ensure_daemon_leaf_and_container_parent() {
                Some((p, "auto-init"))
            } else if let Some(p) = crate::capability::current_cgroup_v2_path() {
                Some((p, "auto"))
            } else {
                None
            };
        #[cfg(not(target_os = "linux"))]
        let auto_parent: Option<(String, &'static str)> = None;

        // The explicit parent takes precedence over the automatic one.
        let (cgroup_parent_value, cgroup_parent_source): (Option<String>, &'static str) =
            explicit_parent
                .or(auto_parent)
                .map_or((None, "none"), |(p, s)| (Some(p), s));

        #[cfg(target_os = "linux")]
        if cgroup_parent_value.is_none() && crate::capability::DaemonCapabilities::get().is_nested {
            tracing::warn!(
                container_id = %cid,
                "capability survey reports nested daemon but cgroup_parent could not be resolved — proceeding with v2 root"
            );
        }

        if let Some(parent) = cgroup_parent_value {
            // Trim trailing slashes so that the joined path has exactly one separator.
            let parent = parent.trim_end_matches('/');
            let full = format!("{parent}/{cid}");
            // One log line per source; only the four tags assigned above can occur
            // here, because "none" is paired with a `None` value.
            match cgroup_parent_source {
                "spec" => tracing::info!(
                    container_id = %cid,
                    source = "spec",
                    path = %full,
                    "cgroup_parent selected"
                ),
                "env" => tracing::info!(
                    container_id = %cid,
                    source = "env",
                    path = %full,
                    "cgroup_parent selected"
                ),
                "auto" => tracing::info!(
                    container_id = %cid,
                    source = "auto",
                    path = %full,
                    "cgroup_parent selected (from /proc/self/cgroup)"
                ),
                "auto-init" => tracing::info!(
                    container_id = %cid,
                    source = "auto-init",
                    path = %full,
                    "cgroup_parent selected (migrated daemon to <scope>/init; containers go under <scope>/containers)"
                ),
                _ => unreachable!(),
            }
            linux_builder = linux_builder.cgroups_path(std::path::PathBuf::from(full));
        } else {
            // No parent resolved: on Linux this is only acceptable when the cgroup
            // root itself is writable; otherwise fail early with remediation hints.
            #[cfg(target_os = "linux")]
            {
                let caps = crate::capability::DaemonCapabilities::get();
                if !caps.can_write_cgroup_root {
                    return Err(AgentError::InvalidSpec(format!(
                        "cannot create container {cid}: no writable cgroup parent. \
                         /proc/self/cgroup reports the cgroup-v2 root, and \
                         /sys/fs/cgroup is read-only to this process. Fix one of: \
                         (a) run the daemon's outer container with --cgroupns=host \
                         so /proc/self/cgroup reports a real parent; \
                         (b) set ZLAYER_CGROUP_PARENT=/path/to/writable/cgroup; \
                         (c) grant the daemon write access to /sys/fs/cgroup."
                    )));
                }
                tracing::info!(
                    container_id = %cid,
                    "cgroup_parent unset — libcontainer will use v2 root (cgroup root is writable here)"
                );
            }
            #[cfg(not(target_os = "linux"))]
            tracing::debug!(
                container_id = %cid,
                "non-Linux host — cgroup_parent unset; libcontainer inside the WSL distro will resolve a parent from its cgroup-v2 root"
            );
        }

        linux_builder
            .build()
            .map_err(|e| AgentError::InvalidSpec(format!("failed to build linux config: {e}")))
    }
2154
2155 #[allow(clippy::cast_possible_truncation, clippy::cast_possible_wrap)]
2157 fn build_resources(
2158 &self,
2159 spec: &ServiceSpec,
2160 ) -> Result<Option<oci_spec::runtime::LinuxResources>> {
2161 let mut resources_builder = LinuxResourcesBuilder::default();
2162 let mut has_resources = false;
2163
2164 if let Some(cpu_limit) = spec.resources.cpu {
2166 let quota = (cpu_limit * 100_000.0) as i64;
2169 let cpu = LinuxCpuBuilder::default()
2170 .quota(quota)
2171 .period(100_000u64)
2172 .build()
2173 .map_err(|e| AgentError::InvalidSpec(format!("failed to build CPU limits: {e}")))?;
2174
2175 resources_builder = resources_builder.cpu(cpu);
2176 has_resources = true;
2177 }
2178
2179 if let Some(ref memory_str) = spec.resources.memory {
2181 let bytes = parse_memory_string(memory_str)
2182 .map_err(|e| AgentError::InvalidSpec(format!("invalid memory limit: {e}")))?;
2183
2184 let memory = LinuxMemoryBuilder::default()
2185 .limit(bytes as i64)
2186 .build()
2187 .map_err(|e| {
2188 AgentError::InvalidSpec(format!("failed to build memory limits: {e}"))
2189 })?;
2190
2191 resources_builder = resources_builder.memory(memory);
2192 has_resources = true;
2193 }
2194
2195 let device_rules = self.build_device_cgroup_rules(spec, None)?;
2197 if !device_rules.is_empty() {
2198 resources_builder = resources_builder.devices(device_rules);
2199 has_resources = true;
2200 }
2201
2202 if has_resources {
2203 let resources = resources_builder
2204 .build()
2205 .map_err(|e| AgentError::InvalidSpec(format!("failed to build resources: {e}")))?;
2206 Ok(Some(resources))
2207 } else {
2208 Ok(None)
2209 }
2210 }
2211
2212 #[allow(clippy::unused_self, clippy::too_many_lines)]
2214 fn build_device_cgroup_rules(
2215 &self,
2216 spec: &ServiceSpec,
2217 _gpu_indices: Option<&[u32]>,
2218 ) -> Result<Vec<oci_spec::runtime::LinuxDeviceCgroup>> {
2219 let mut rules = Vec::new();
2220
2221 if spec.privileged {
2222 let rule = LinuxDeviceCgroupBuilder::default()
2224 .allow(true)
2225 .access("rwm".to_string())
2226 .build()
2227 .map_err(|e| {
2228 AgentError::InvalidSpec(format!("failed to build device cgroup rule: {e}"))
2229 })?;
2230 rules.push(rule);
2231 } else {
2232 let deny_all = LinuxDeviceCgroupBuilder::default()
2234 .allow(false)
2235 .access("rwm".to_string())
2236 .build()
2237 .map_err(|e| AgentError::InvalidSpec(format!("failed to build deny rule: {e}")))?;
2238 rules.push(deny_all);
2239
2240 let standard_char_devices = [
2243 (1, 3, "rwm"), (1, 5, "rwm"), (1, 7, "rwm"), (1, 8, "rwm"), (1, 9, "rwm"), (5, 0, "rwm"), (5, 1, "rwm"), (5, 2, "rwm"), (136, -1, "rwm"), ];
2253
2254 for (major, minor, access) in standard_char_devices {
2255 let mut builder = LinuxDeviceCgroupBuilder::default()
2256 .allow(true)
2257 .typ(LinuxDeviceType::C)
2258 .major(i64::from(major))
2259 .access(access.to_string());
2260
2261 if minor >= 0 {
2262 builder = builder.minor(i64::from(minor));
2263 }
2264
2265 let rule = builder.build().map_err(|e| {
2266 AgentError::InvalidSpec(format!("failed to build char device rule: {e}"))
2267 })?;
2268 rules.push(rule);
2269 }
2270
2271 #[cfg(unix)]
2275 for device in &spec.devices {
2276 if let Ok((major, minor)) = get_device_major_minor(&device.path) {
2277 let dev_type = get_device_type(&device.path).unwrap_or(LinuxDeviceType::C);
2278
2279 let mut access = String::new();
2281 if device.read {
2282 access.push('r');
2283 }
2284 if device.write {
2285 access.push('w');
2286 }
2287 if device.mknod {
2288 access.push('m');
2289 }
2290 if access.is_empty() {
2291 access = "rw".to_string();
2292 }
2293
2294 let rule = LinuxDeviceCgroupBuilder::default()
2295 .allow(true)
2296 .typ(dev_type)
2297 .major(major)
2298 .minor(minor)
2299 .access(access)
2300 .build()
2301 .map_err(|e| {
2302 AgentError::InvalidSpec(format!(
2303 "failed to build device rule for {}: {}",
2304 device.path, e
2305 ))
2306 })?;
2307 rules.push(rule);
2308 } else {
2309 tracing::warn!("Failed to get device info for {}, skipping", device.path);
2310 }
2311 }
2312
2313 if let Some(ref gpu) = spec.resources.gpu {
2315 match gpu.vendor.as_str() {
2316 "nvidia" => {
2317 let rule = LinuxDeviceCgroupBuilder::default()
2319 .allow(true)
2320 .typ(LinuxDeviceType::C)
2321 .major(195i64)
2322 .access("rwm".to_string())
2323 .build()
2324 .map_err(|e| {
2325 AgentError::InvalidSpec(format!(
2326 "failed to build GPU cgroup rule: {e}"
2327 ))
2328 })?;
2329 rules.push(rule);
2330
2331 let uvm_rule = LinuxDeviceCgroupBuilder::default()
2333 .allow(true)
2334 .typ(LinuxDeviceType::C)
2335 .major(510i64)
2336 .access("rwm".to_string())
2337 .build()
2338 .map_err(|e| {
2339 AgentError::InvalidSpec(format!(
2340 "failed to build GPU UVM cgroup rule: {e}"
2341 ))
2342 })?;
2343 rules.push(uvm_rule);
2344 }
2345 "amd" => {
2346 let dri_rule = LinuxDeviceCgroupBuilder::default()
2348 .allow(true)
2349 .typ(LinuxDeviceType::C)
2350 .major(226i64)
2351 .access("rwm".to_string())
2352 .build()
2353 .map_err(|e| {
2354 AgentError::InvalidSpec(format!(
2355 "failed to build AMD DRI cgroup rule: {e}"
2356 ))
2357 })?;
2358 rules.push(dri_rule);
2359
2360 let kfd_rule = LinuxDeviceCgroupBuilder::default()
2362 .allow(true)
2363 .typ(LinuxDeviceType::C)
2364 .major(234i64)
2365 .access("rwm".to_string())
2366 .build()
2367 .map_err(|e| {
2368 AgentError::InvalidSpec(format!(
2369 "failed to build AMD KFD cgroup rule: {e}"
2370 ))
2371 })?;
2372 rules.push(kfd_rule);
2373 }
2374 "intel" => {
2375 let dri_rule = LinuxDeviceCgroupBuilder::default()
2377 .allow(true)
2378 .typ(LinuxDeviceType::C)
2379 .major(226i64)
2380 .access("rwm".to_string())
2381 .build()
2382 .map_err(|e| {
2383 AgentError::InvalidSpec(format!(
2384 "failed to build Intel DRI cgroup rule: {e}"
2385 ))
2386 })?;
2387 rules.push(dri_rule);
2388 }
2389 other => {
2390 tracing::warn!(
2392 vendor = %other,
2393 "Unknown GPU vendor, allowing DRI devices (major 226)"
2394 );
2395 let dri_rule = LinuxDeviceCgroupBuilder::default()
2396 .allow(true)
2397 .typ(LinuxDeviceType::C)
2398 .major(226i64)
2399 .access("rwm".to_string())
2400 .build()
2401 .map_err(|e| {
2402 AgentError::InvalidSpec(format!(
2403 "failed to build GPU DRI cgroup rule: {e}"
2404 ))
2405 })?;
2406 rules.push(dri_rule);
2407 }
2408 }
2409 }
2410 }
2411
2412 Ok(rules)
2413 }
2414
2415 #[allow(clippy::unused_self, clippy::too_many_lines)]
2424 #[cfg_attr(not(unix), allow(clippy::unnecessary_wraps, clippy::needless_return))]
2425 fn build_devices(
2426 &self,
2427 spec: &ServiceSpec,
2428 gpu_indices: Option<&[u32]>,
2429 skip_gpu_defaults: bool,
2430 ) -> Result<Vec<oci_spec::runtime::LinuxDevice>> {
2431 #[cfg(not(unix))]
2432 {
2433 let _ = (spec, gpu_indices, skip_gpu_defaults);
2434 return Ok(Vec::new());
2435 }
2436
2437 #[cfg(unix)]
2438 {
2439 let mut devices = Vec::new();
2440
2441 for device in &spec.devices {
2442 if let Ok((major, minor)) = get_device_major_minor(&device.path) {
2443 let dev_type = get_device_type(&device.path).unwrap_or(LinuxDeviceType::C);
2444
2445 let linux_device = LinuxDeviceBuilder::default()
2446 .path(device.path.clone())
2447 .typ(dev_type)
2448 .major(major)
2449 .minor(minor)
2450 .file_mode(0o666u32)
2451 .uid(0u32)
2452 .gid(0u32)
2453 .build()
2454 .map_err(|e| {
2455 AgentError::InvalidSpec(format!(
2456 "failed to build device {}: {}",
2457 device.path, e
2458 ))
2459 })?;
2460
2461 devices.push(linux_device);
2462 }
2463 }
2464
2465 if skip_gpu_defaults {
2470 return Ok(devices);
2471 }
2472
2473 if let Some(ref gpu) = spec.resources.gpu {
2475 let indices: Vec<u32> =
2476 gpu_indices.map_or_else(|| (0..gpu.count).collect(), <[u32]>::to_vec);
2477
2478 match gpu.vendor.as_str() {
2479 "nvidia" => {
2480 let always_devices =
2482 ["/dev/nvidiactl", "/dev/nvidia-uvm", "/dev/nvidia-uvm-tools"];
2483 for dev_path in &always_devices {
2484 if let Ok((major, minor)) = get_device_major_minor(dev_path) {
2485 let dev_type =
2486 get_device_type(dev_path).unwrap_or(LinuxDeviceType::C);
2487 let linux_device = LinuxDeviceBuilder::default()
2488 .path((*dev_path).to_string())
2489 .typ(dev_type)
2490 .major(major)
2491 .minor(minor)
2492 .file_mode(0o666u32)
2493 .uid(0u32)
2494 .gid(0u32)
2495 .build()
2496 .map_err(|e| {
2497 AgentError::InvalidSpec(format!(
2498 "failed to build GPU device {dev_path}: {e}"
2499 ))
2500 })?;
2501 devices.push(linux_device);
2502 } else {
2503 tracing::warn!(
2504 "GPU device {} not found on host, skipping",
2505 dev_path
2506 );
2507 }
2508 }
2509
2510 for i in &indices {
2512 let dev_path = format!("/dev/nvidia{i}");
2513 if let Ok((major, minor)) = get_device_major_minor(&dev_path) {
2514 let dev_type =
2515 get_device_type(&dev_path).unwrap_or(LinuxDeviceType::C);
2516 let linux_device = LinuxDeviceBuilder::default()
2517 .path(dev_path.clone())
2518 .typ(dev_type)
2519 .major(major)
2520 .minor(minor)
2521 .file_mode(0o666u32)
2522 .uid(0u32)
2523 .gid(0u32)
2524 .build()
2525 .map_err(|e| {
2526 AgentError::InvalidSpec(format!(
2527 "failed to build GPU device {dev_path}: {e}"
2528 ))
2529 })?;
2530 devices.push(linux_device);
2531 } else {
2532 tracing::warn!(
2533 "GPU device {} not found on host, skipping",
2534 dev_path
2535 );
2536 }
2537 }
2538 }
2539 "amd" => {
2540 let amd_always_devices = ["/dev/kfd"];
2542 for dev_path in &amd_always_devices {
2543 if let Ok((major, minor)) = get_device_major_minor(dev_path) {
2544 let dev_type =
2545 get_device_type(dev_path).unwrap_or(LinuxDeviceType::C);
2546 let linux_device = LinuxDeviceBuilder::default()
2547 .path((*dev_path).to_string())
2548 .typ(dev_type)
2549 .major(major)
2550 .minor(minor)
2551 .file_mode(0o666u32)
2552 .uid(0u32)
2553 .gid(0u32)
2554 .build()
2555 .map_err(|e| {
2556 AgentError::InvalidSpec(format!(
2557 "failed to build GPU device {dev_path}: {e}"
2558 ))
2559 })?;
2560 devices.push(linux_device);
2561 } else {
2562 tracing::warn!(
2563 "GPU device {} not found on host, skipping",
2564 dev_path
2565 );
2566 }
2567 }
2568
2569 for i in &indices {
2571 let dev_path = format!("/dev/dri/renderD{}", 128 + i);
2572 if let Ok((major, minor)) = get_device_major_minor(&dev_path) {
2573 let dev_type =
2574 get_device_type(&dev_path).unwrap_or(LinuxDeviceType::C);
2575 let linux_device = LinuxDeviceBuilder::default()
2576 .path(dev_path.clone())
2577 .typ(dev_type)
2578 .major(major)
2579 .minor(minor)
2580 .file_mode(0o666u32)
2581 .uid(0u32)
2582 .gid(0u32)
2583 .build()
2584 .map_err(|e| {
2585 AgentError::InvalidSpec(format!(
2586 "failed to build GPU device {dev_path}: {e}"
2587 ))
2588 })?;
2589 devices.push(linux_device);
2590 } else {
2591 tracing::warn!(
2592 "GPU device {} not found on host, skipping",
2593 dev_path
2594 );
2595 }
2596 }
2597
2598 for i in &indices {
2600 let dev_path = format!("/dev/dri/card{i}");
2601 if let Ok((major, minor)) = get_device_major_minor(&dev_path) {
2602 let dev_type =
2603 get_device_type(&dev_path).unwrap_or(LinuxDeviceType::C);
2604 let linux_device = LinuxDeviceBuilder::default()
2605 .path(dev_path.clone())
2606 .typ(dev_type)
2607 .major(major)
2608 .minor(minor)
2609 .file_mode(0o666u32)
2610 .uid(0u32)
2611 .gid(0u32)
2612 .build()
2613 .map_err(|e| {
2614 AgentError::InvalidSpec(format!(
2615 "failed to build GPU device {dev_path}: {e}"
2616 ))
2617 })?;
2618 devices.push(linux_device);
2619 } else {
2620 tracing::warn!(
2621 "GPU device {} not found on host, skipping",
2622 dev_path
2623 );
2624 }
2625 }
2626 }
2627 "intel" => {
2628 for i in &indices {
2630 let dev_path = format!("/dev/dri/renderD{}", 128 + i);
2631 if let Ok((major, minor)) = get_device_major_minor(&dev_path) {
2632 let dev_type =
2633 get_device_type(&dev_path).unwrap_or(LinuxDeviceType::C);
2634 let linux_device = LinuxDeviceBuilder::default()
2635 .path(dev_path.clone())
2636 .typ(dev_type)
2637 .major(major)
2638 .minor(minor)
2639 .file_mode(0o666u32)
2640 .uid(0u32)
2641 .gid(0u32)
2642 .build()
2643 .map_err(|e| {
2644 AgentError::InvalidSpec(format!(
2645 "failed to build GPU device {dev_path}: {e}"
2646 ))
2647 })?;
2648 devices.push(linux_device);
2649 } else {
2650 tracing::warn!(
2651 "GPU device {} not found on host, skipping",
2652 dev_path
2653 );
2654 }
2655 }
2656
2657 for i in &indices {
2659 let dev_path = format!("/dev/dri/card{i}");
2660 if let Ok((major, minor)) = get_device_major_minor(&dev_path) {
2661 let dev_type =
2662 get_device_type(&dev_path).unwrap_or(LinuxDeviceType::C);
2663 let linux_device = LinuxDeviceBuilder::default()
2664 .path(dev_path.clone())
2665 .typ(dev_type)
2666 .major(major)
2667 .minor(minor)
2668 .file_mode(0o666u32)
2669 .uid(0u32)
2670 .gid(0u32)
2671 .build()
2672 .map_err(|e| {
2673 AgentError::InvalidSpec(format!(
2674 "failed to build GPU device {dev_path}: {e}"
2675 ))
2676 })?;
2677 devices.push(linux_device);
2678 } else {
2679 tracing::warn!(
2680 "GPU device {} not found on host, skipping",
2681 dev_path
2682 );
2683 }
2684 }
2685 }
2686 other => {
2687 tracing::warn!(
2689 vendor = %other,
2690 "Unknown GPU vendor, attempting DRI device passthrough"
2691 );
2692 for i in &indices {
2693 let dev_path = format!("/dev/dri/renderD{}", 128 + i);
2694 if let Ok((major, minor)) = get_device_major_minor(&dev_path) {
2695 let dev_type =
2696 get_device_type(&dev_path).unwrap_or(LinuxDeviceType::C);
2697 let linux_device = LinuxDeviceBuilder::default()
2698 .path(dev_path.clone())
2699 .typ(dev_type)
2700 .major(major)
2701 .minor(minor)
2702 .file_mode(0o666u32)
2703 .uid(0u32)
2704 .gid(0u32)
2705 .build()
2706 .map_err(|e| {
2707 AgentError::InvalidSpec(format!(
2708 "failed to build GPU device {dev_path}: {e}"
2709 ))
2710 })?;
2711 devices.push(linux_device);
2712 } else {
2713 tracing::warn!(
2714 "GPU device {} not found on host, skipping",
2715 dev_path
2716 );
2717 }
2718 }
2719 }
2720 }
2721 }
2722
2723 Ok(devices)
2724 } }
2726
2727 pub async fn write_config(
2739 &self,
2740 container_id: &ContainerId,
2741 spec: &ServiceSpec,
2742 ) -> Result<PathBuf> {
2743 let oci_spec = self
2745 .build_spec_only(container_id, spec, &self.volume_paths)
2746 .await?;
2747
2748 let config_path = self.bundle_dir.join("config.json");
2750 let config_json =
2751 serde_json::to_string_pretty(&oci_spec).map_err(|e| AgentError::CreateFailed {
2752 id: container_id.to_string(),
2753 reason: format!("failed to serialize OCI spec: {e}"),
2754 })?;
2755
2756 fs::write(&config_path, config_json)
2757 .await
2758 .map_err(|e| AgentError::CreateFailed {
2759 id: container_id.to_string(),
2760 reason: format!("failed to write config.json: {e}"),
2761 })?;
2762
2763 tracing::debug!(
2764 "Wrote OCI config.json at {} for container {}",
2765 config_path.display(),
2766 container_id
2767 );
2768
2769 Ok(self.bundle_dir.clone())
2770 }
2771
2772 fn resolve_command_from_spec(
2781 spec: &ServiceSpec,
2782 image_config: Option<&zlayer_registry::ImageConfig>,
2783 ) -> Vec<String> {
2784 let mut args = Vec::new();
2785
2786 match (&spec.command.entrypoint, &spec.command.args) {
2787 (Some(entrypoint), Some(cmd_args)) => {
2788 args.extend_from_slice(entrypoint);
2789 args.extend_from_slice(cmd_args);
2790 }
2791 (Some(entrypoint), None) => {
2792 args.extend_from_slice(entrypoint);
2793 }
2794 (None, Some(cmd_args)) if !cmd_args.is_empty() => {
2795 args.extend_from_slice(cmd_args);
2796 }
2797 _ => {
2798 if let Some(img_cmd) =
2800 image_config.and_then(zlayer_registry::ImageConfig::full_command)
2801 {
2802 if img_cmd.is_empty() {
2803 args.push("/bin/sh".to_string());
2804 } else {
2805 args.extend(img_cmd);
2806 }
2807 } else {
2808 args.push("/bin/sh".to_string());
2809 }
2810 }
2811 }
2812
2813 args
2814 }
2815
2816 pub async fn cleanup(&self) -> Result<()> {
2823 if self.bundle_dir.exists() {
2824 fs::remove_dir_all(&self.bundle_dir)
2825 .await
2826 .map_err(|e| AgentError::CreateFailed {
2827 id: "cleanup".to_string(),
2828 reason: format!(
2829 "failed to remove bundle directory {}: {}",
2830 self.bundle_dir.display(),
2831 e
2832 ),
2833 })?;
2834 }
2835 Ok(())
2836 }
2837}
2838
2839#[cfg(unix)]
2852pub async fn create_bundle(
2853 container_id: &ContainerId,
2854 spec: &ServiceSpec,
2855 rootfs_path: Option<PathBuf>,
2856) -> Result<PathBuf> {
2857 let mut builder =
2858 BundleBuilder::for_container(container_id).with_host_network(spec.host_network);
2859
2860 if let Some(rootfs) = rootfs_path {
2861 builder = builder.with_rootfs(rootfs);
2862 }
2863
2864 builder.build(container_id, spec).await
2865}
2866
2867pub async fn cleanup_bundle(container_id: &ContainerId) -> Result<()> {
2874 let builder = BundleBuilder::for_container(container_id);
2875 builder.cleanup().await
2876}
2877
2878#[cfg(test)]
2879mod tests {
2880 use super::*;
2881 use zlayer_spec::*;
2882
2883 fn mock_spec() -> ServiceSpec {
2884 serde_yaml::from_str::<DeploymentSpec>(
2885 r"
2886version: v1
2887deployment: test
2888services:
2889 test:
2890 rtype: service
2891 image:
2892 name: test:latest
2893 endpoints:
2894 - name: http
2895 protocol: http
2896 port: 8080
2897",
2898 )
2899 .unwrap()
2900 .services
2901 .remove("test")
2902 .unwrap()
2903 }
2904
2905 #[cfg(target_os = "linux")]
2906 fn mock_spec_with_resources() -> ServiceSpec {
2907 serde_yaml::from_str::<DeploymentSpec>(
2908 r"
2909version: v1
2910deployment: test
2911services:
2912 test:
2913 rtype: service
2914 image:
2915 name: test:latest
2916 resources:
2917 cpu: 0.5
2918 memory: 512Mi
2919 env:
2920 MY_VAR: my_value
2921 ANOTHER: value2
2922 endpoints:
2923 - name: http
2924 protocol: http
2925 port: 8080
2926",
2927 )
2928 .unwrap()
2929 .services
2930 .remove("test")
2931 .unwrap()
2932 }
2933
2934 #[cfg(target_os = "linux")]
2935 fn mock_privileged_spec() -> ServiceSpec {
2936 serde_yaml::from_str::<DeploymentSpec>(
2937 r"
2938version: v1
2939deployment: test
2940services:
2941 test:
2942 rtype: service
2943 image:
2944 name: test:latest
2945 privileged: true
2946 endpoints:
2947 - name: http
2948 protocol: http
2949 port: 8080
2950",
2951 )
2952 .unwrap()
2953 .services
2954 .remove("test")
2955 .unwrap()
2956 }
2957
2958 #[test]
2959 fn test_parse_memory_string() {
2960 assert_eq!(parse_memory_string("512Mi").unwrap(), 512 * 1024 * 1024);
2961 assert_eq!(parse_memory_string("1Gi").unwrap(), 1024 * 1024 * 1024);
2962 assert_eq!(parse_memory_string("2G").unwrap(), 2 * 1000 * 1000 * 1000);
2963 assert_eq!(parse_memory_string("1024").unwrap(), 1024);
2964 assert_eq!(parse_memory_string("512Ki").unwrap(), 512 * 1024);
2965 }
2966
2967 #[test]
2968 fn test_parse_memory_string_errors() {
2969 assert!(parse_memory_string("").is_err());
2970 assert!(parse_memory_string("abc").is_err());
2971 assert!(parse_memory_string("12.5Mi").is_err());
2972 }
2973
2974 #[test]
2975 fn test_generate_resolv_conf_single_nameserver() {
2976 let out = generate_resolv_conf(&["10.42.0.1".to_string()]);
2977 assert_eq!(out, "nameserver 10.42.0.1\noptions edns0\n");
2978 }
2979
2980 #[test]
2981 fn test_generate_resolv_conf_two_nameservers() {
2982 let out = generate_resolv_conf(&["10.42.0.1".to_string(), "fd00::1".to_string()]);
2983 assert_eq!(
2984 out,
2985 "nameserver 10.42.0.1\nnameserver fd00::1\noptions edns0\n"
2986 );
2987 }
2988
2989 #[cfg(target_os = "linux")]
2990 #[tokio::test]
2991 async fn test_build_oci_spec_injects_resolv_conf_mount() {
2992 let dir = tempfile::tempdir().unwrap();
2993 let id = ContainerId::new("test".to_string(), 1);
2994 let mut spec = mock_spec();
2995 spec.dns = vec!["10.42.0.1".to_string()];
2996 let builder = BundleBuilder::new(dir.path().to_path_buf());
2997
2998 let oci_spec = builder
2999 .build_spec_only(&id, &spec, &std::collections::HashMap::new())
3000 .await
3001 .unwrap();
3002
3003 let mounts = oci_spec.mounts().as_ref().expect("mounts present");
3004 let resolv_mount = mounts
3005 .iter()
3006 .find(|m| m.destination() == Path::new("/etc/resolv.conf"))
3007 .expect("resolv.conf mount injected");
3008 let source = resolv_mount.source().as_ref().unwrap();
3009 let written = std::fs::read_to_string(source).unwrap();
3010 assert_eq!(written, "nameserver 10.42.0.1\noptions edns0\n");
3011 }
3012
3013 #[cfg(target_os = "linux")]
3014 #[tokio::test]
3015 async fn test_build_oci_spec_no_resolv_conf_when_dns_empty() {
3016 let dir = tempfile::tempdir().unwrap();
3017 let id = ContainerId::new("test".to_string(), 1);
3018 let spec = mock_spec(); let builder = BundleBuilder::new(dir.path().to_path_buf());
3020
3021 let oci_spec = builder
3022 .build_spec_only(&id, &spec, &std::collections::HashMap::new())
3023 .await
3024 .unwrap();
3025
3026 let mounts = oci_spec.mounts().as_ref().expect("mounts present");
3027 assert!(
3028 !mounts
3029 .iter()
3030 .any(|m| m.destination() == Path::new("/etc/resolv.conf")),
3031 "no resolv.conf mount should be injected for empty spec.dns"
3032 );
3033 }
3034
3035 #[cfg(target_os = "linux")]
3036 #[tokio::test]
3037 async fn test_build_oci_spec_no_resolv_conf_when_host_network() {
3038 let dir = tempfile::tempdir().unwrap();
3039 let id = ContainerId::new("test".to_string(), 1);
3040 let mut spec = mock_spec();
3041 spec.dns = vec!["10.42.0.1".to_string()];
3042 spec.host_network = true;
3043 let builder = BundleBuilder::new(dir.path().to_path_buf());
3044
3045 let oci_spec = builder
3046 .build_spec_only(&id, &spec, &std::collections::HashMap::new())
3047 .await
3048 .unwrap();
3049
3050 let mounts = oci_spec.mounts().as_ref().expect("mounts present");
3051 assert!(
3052 !mounts
3053 .iter()
3054 .any(|m| m.destination() == Path::new("/etc/resolv.conf")),
3055 "host_network containers must inherit the host resolv.conf"
3056 );
3057 }
3058
3059 #[test]
3060 fn test_bundle_builder_new() {
3061 let builder = BundleBuilder::new("/tmp/test-bundle".into());
3062 assert_eq!(builder.bundle_dir(), Path::new("/tmp/test-bundle"));
3063 assert!(builder.rootfs_path.is_none());
3064 }
3065
3066 #[test]
3067 fn test_bundle_builder_for_container() {
3068 let dirs = zlayer_paths::ZLayerDirs::system_default();
3069 let id = ContainerId::new("myservice".to_string(), 1);
3070 let builder = BundleBuilder::for_container(&id);
3071 assert_eq!(builder.bundle_dir(), dirs.bundles().join("myservice-rep-1"));
3072 }
3073
3074 #[test]
3075 fn test_bundle_builder_with_rootfs() {
3076 let dirs = zlayer_paths::ZLayerDirs::system_default();
3077 let builder = BundleBuilder::new("/tmp/test-bundle".into())
3078 .with_rootfs(dirs.rootfs().join("myimage"));
3079 assert_eq!(builder.rootfs_path, Some(dirs.rootfs().join("myimage")));
3080 }
3081
3082 #[cfg(target_os = "linux")]
3083 #[tokio::test]
3084 async fn test_build_oci_spec_basic() {
3085 let id = ContainerId::new("test".to_string(), 1);
3086 let spec = mock_spec();
3087 let builder = BundleBuilder::new("/tmp/test-bundle".into());
3088
3089 let oci_spec = builder
3090 .build_spec_only(&id, &spec, &std::collections::HashMap::new())
3091 .await
3092 .unwrap();
3093
3094 assert_eq!(oci_spec.version(), "1.0.2");
3095 assert!(oci_spec.root().is_some());
3096 assert_eq!(
3097 oci_spec.root().as_ref().unwrap().path(),
3098 std::path::Path::new("rootfs")
3099 );
3100 assert!(oci_spec.process().is_some());
3101 assert!(oci_spec.linux().is_some());
3102 }
3103
3104 #[cfg(target_os = "linux")]
3105 #[tokio::test]
3106 async fn test_build_oci_spec_with_resources() {
3107 let id = ContainerId::new("test".to_string(), 1);
3108 let spec = mock_spec_with_resources();
3109 let builder = BundleBuilder::new("/tmp/test-bundle".into());
3110
3111 let oci_spec = builder
3112 .build_spec_only(&id, &spec, &std::collections::HashMap::new())
3113 .await
3114 .unwrap();
3115
3116 let linux = oci_spec.linux().as_ref().unwrap();
3118 let resources = linux.resources().as_ref().unwrap();
3119
3120 let cpu = resources.cpu().as_ref().unwrap();
3122 assert_eq!(cpu.quota(), Some(50_000)); assert_eq!(cpu.period(), Some(100_000));
3124
3125 let memory = resources.memory().as_ref().unwrap();
3127 assert_eq!(memory.limit(), Some(512 * 1024 * 1024)); }
3129
3130 #[cfg(target_os = "linux")]
3131 #[tokio::test]
3132 async fn test_build_oci_spec_translates_ulimits() {
3133 let id = ContainerId::new("test".to_string(), 1);
3134 let mut spec = mock_spec();
3135 spec.ulimits.insert(
3136 "nofile".to_string(),
3137 UlimitSpec {
3138 soft: 100_000,
3139 hard: 200_000,
3140 },
3141 );
3142 spec.ulimits
3144 .insert("nproc".to_string(), UlimitSpec { soft: -1, hard: -5 });
3145 let builder = BundleBuilder::new("/tmp/test-bundle".into());
3146
3147 let oci_spec = builder
3148 .build_spec_only(&id, &spec, &std::collections::HashMap::new())
3149 .await
3150 .unwrap();
3151
3152 let process = oci_spec.process().as_ref().expect("process present");
3153 let rlimits = process.rlimits().as_ref().expect("rlimits present");
3154
3155 let nofile: Vec<_> = rlimits
3159 .iter()
3160 .filter(|r| r.typ() == PosixRlimitType::RlimitNofile)
3161 .collect();
3162 assert_eq!(nofile.len(), 1, "nofile must not be duplicated");
3163 assert_eq!(nofile[0].soft(), 100_000);
3164 assert_eq!(nofile[0].hard(), 200_000);
3165
3166 let nproc = rlimits
3167 .iter()
3168 .find(|r| r.typ() == PosixRlimitType::RlimitNproc)
3169 .expect("nproc rlimit present");
3170 assert_eq!(nproc.soft(), 0, "negative soft clamps to 0");
3171 assert_eq!(nproc.hard(), 0, "negative hard clamps to 0");
3172 }
3173
3174 #[cfg(target_os = "linux")]
3175 #[tokio::test]
3176 async fn test_build_oci_spec_rejects_unknown_ulimit() {
3177 let id = ContainerId::new("test".to_string(), 1);
3178 let mut spec = mock_spec();
3179 spec.ulimits.insert(
3180 "not_a_real_ulimit".to_string(),
3181 UlimitSpec { soft: 1, hard: 1 },
3182 );
3183 let builder = BundleBuilder::new("/tmp/test-bundle".into());
3184
3185 let err = builder
3186 .build_spec_only(&id, &spec, &std::collections::HashMap::new())
3187 .await
3188 .expect_err("unknown ulimit name must be rejected");
3189 assert!(
3190 err.to_string().contains("not_a_real_ulimit"),
3191 "error should name the unknown ulimit: {err}"
3192 );
3193 }
3194
3195 #[cfg(target_os = "linux")]
3196 #[tokio::test]
3197 async fn test_build_oci_spec_keeps_oci_default_rlimits_when_ulimits_empty() {
3198 let id = ContainerId::new("test".to_string(), 1);
3204 let spec = mock_spec();
3205 let builder = BundleBuilder::new("/tmp/test-bundle".into());
3206
3207 let oci_spec = builder
3208 .build_spec_only(&id, &spec, &std::collections::HashMap::new())
3209 .await
3210 .unwrap();
3211
3212 let process = oci_spec.process().as_ref().expect("process present");
3213 let rlimits = process
3214 .rlimits()
3215 .as_ref()
3216 .expect("oci default rlimits present");
3217 let nofile = rlimits
3218 .iter()
3219 .find(|r| r.typ() == PosixRlimitType::RlimitNofile)
3220 .expect("default nofile rlimit present");
3221 assert_eq!(nofile.soft(), 1024);
3224 assert_eq!(nofile.hard(), 1024);
3225 }
3226
3227 #[cfg(target_os = "linux")]
3228 #[tokio::test]
3229 async fn test_build_oci_spec_privileged() {
3230 let id = ContainerId::new("test".to_string(), 1);
3231 let spec = mock_privileged_spec();
3232 let builder = BundleBuilder::new("/tmp/test-bundle".into());
3233
3234 let oci_spec = builder
3235 .build_spec_only(&id, &spec, &std::collections::HashMap::new())
3236 .await
3237 .unwrap();
3238
3239 let process = oci_spec.process().as_ref().unwrap();
3241 let caps = process.capabilities().as_ref().unwrap();
3242 let bounding = caps.bounding().as_ref().unwrap();
3243
3244 assert!(bounding.contains(&Capability::SysAdmin));
3246 assert!(bounding.contains(&Capability::NetAdmin));
3247
3248 let linux = oci_spec.linux().as_ref().unwrap();
3250 assert!(
3251 linux.masked_paths().is_none() || linux.masked_paths().as_ref().unwrap().is_empty()
3252 );
3253 }
3254
3255 #[cfg(target_os = "linux")]
3256 #[tokio::test]
3257 async fn test_build_oci_spec_environment() {
3258 let id = ContainerId::new("test".to_string(), 1);
3259 let spec = mock_spec_with_resources();
3260 let builder = BundleBuilder::new("/tmp/test-bundle".into())
3261 .with_env("EXTRA_VAR".to_string(), "extra_value".to_string());
3262
3263 let oci_spec = builder
3264 .build_spec_only(&id, &spec, &std::collections::HashMap::new())
3265 .await
3266 .unwrap();
3267
3268 let process = oci_spec.process().as_ref().unwrap();
3269 let env = process.env().as_ref().unwrap();
3270
3271 assert!(env.iter().any(|e| e == "MY_VAR=my_value"));
3273 assert!(env.iter().any(|e| e == "ANOTHER=value2"));
3274 assert!(env.iter().any(|e| e == "EXTRA_VAR=extra_value"));
3276 assert!(env.iter().any(|e| e.starts_with("PATH=")));
3278 }
3279
3280 #[cfg(target_os = "linux")]
3281 #[tokio::test]
3282 async fn test_build_namespaces() {
3283 let id = ContainerId::new("test".to_string(), 1);
3284 let spec = mock_spec();
3285 let builder = BundleBuilder::new("/tmp/test-bundle".into());
3286
3287 let oci_spec = builder
3288 .build_spec_only(&id, &spec, &std::collections::HashMap::new())
3289 .await
3290 .unwrap();
3291 let linux = oci_spec.linux().as_ref().unwrap();
3292 let namespaces = linux.namespaces().as_ref().unwrap();
3293
3294 let namespace_types: Vec<_> = namespaces
3296 .iter()
3297 .map(oci_spec::runtime::LinuxNamespace::typ)
3298 .collect();
3299 assert!(namespace_types.contains(&LinuxNamespaceType::Pid));
3300 assert!(namespace_types.contains(&LinuxNamespaceType::Ipc));
3301 assert!(namespace_types.contains(&LinuxNamespaceType::Uts));
3302 assert!(namespace_types.contains(&LinuxNamespaceType::Mount));
3303 assert!(namespace_types.contains(&LinuxNamespaceType::Network));
3304 }
3305
3306 #[cfg(target_os = "linux")]
3307 #[tokio::test]
3308 async fn test_build_namespaces_host_network() {
3309 let id = ContainerId::new("test".to_string(), 1);
3310 let spec = mock_spec();
3311 let builder = BundleBuilder::new("/tmp/test-bundle".into()).with_host_network(true);
3312
3313 let oci_spec = builder
3314 .build_spec_only(&id, &spec, &std::collections::HashMap::new())
3315 .await
3316 .unwrap();
3317 let linux = oci_spec.linux().as_ref().unwrap();
3318 let namespaces = linux.namespaces().as_ref().unwrap();
3319
3320 let namespace_types: Vec<_> = namespaces
3322 .iter()
3323 .map(oci_spec::runtime::LinuxNamespace::typ)
3324 .collect();
3325 assert!(namespace_types.contains(&LinuxNamespaceType::Pid));
3326 assert!(namespace_types.contains(&LinuxNamespaceType::Ipc));
3327 assert!(namespace_types.contains(&LinuxNamespaceType::Uts));
3328 assert!(namespace_types.contains(&LinuxNamespaceType::Mount));
3329 assert!(
3330 !namespace_types.contains(&LinuxNamespaceType::Network),
3331 "Network namespace should NOT be present in host_network mode"
3332 );
3333 }
3334
3335 #[test]
3336 fn test_build_default_mounts() {
3337 let spec = mock_spec();
3338 let builder = BundleBuilder::new("/tmp/test-bundle".into());
3339
3340 let mounts = builder.build_default_mounts(&spec).unwrap();
3341
3342 let mount_destinations: Vec<_> = mounts
3344 .iter()
3345 .map(|m| m.destination().to_string_lossy().to_string())
3346 .collect();
3347 assert!(mount_destinations.contains(&"/proc".to_string()));
3348 assert!(mount_destinations.contains(&"/dev".to_string()));
3349 assert!(mount_destinations.contains(&"/dev/pts".to_string()));
3350 assert!(mount_destinations.contains(&"/dev/shm".to_string()));
3351 assert!(mount_destinations.contains(&"/sys".to_string()));
3352 }
3353
3354 #[test]
3355 fn test_build_storage_mounts_bind() {
3356 let spec = serde_yaml::from_str::<zlayer_spec::DeploymentSpec>(
3357 r"
3358version: v1
3359deployment: test
3360services:
3361 test:
3362 image:
3363 name: test:latest
3364 storage:
3365 - type: bind
3366 source: /host/data
3367 target: /app/data
3368 readonly: true
3369",
3370 )
3371 .unwrap()
3372 .services
3373 .remove("test")
3374 .unwrap();
3375
3376 let builder = BundleBuilder::new("/tmp/test-bundle".into());
3377 let volume_paths = std::collections::HashMap::new();
3378
3379 let mounts = builder.build_storage_mounts(&spec, &volume_paths).unwrap();
3380
3381 assert_eq!(mounts.len(), 1);
3382 assert_eq!(mounts[0].destination().to_string_lossy(), "/app/data");
3383 assert_eq!(
3384 mounts[0]
3385 .source()
3386 .as_ref()
3387 .map(|s| s.to_string_lossy().to_string()),
3388 Some("/host/data".to_string())
3389 );
3390 let options = mounts[0].options().as_ref().unwrap();
3391 assert!(options.contains(&"rbind".to_string()));
3392 assert!(options.contains(&"ro".to_string()));
3393 }
3394
3395 #[test]
3396 fn test_build_storage_mounts_named() {
3397 let spec = serde_yaml::from_str::<zlayer_spec::DeploymentSpec>(
3398 r"
3399version: v1
3400deployment: test
3401services:
3402 test:
3403 image:
3404 name: test:latest
3405 storage:
3406 - type: named
3407 name: my-volume
3408 target: /app/data
3409",
3410 )
3411 .unwrap()
3412 .services
3413 .remove("test")
3414 .unwrap();
3415
3416 let dirs = zlayer_paths::ZLayerDirs::system_default();
3417 let builder = BundleBuilder::new("/tmp/test-bundle".into());
3418 let mut volume_paths = std::collections::HashMap::new();
3419 volume_paths.insert("my-volume".to_string(), dirs.volumes().join("my-volume"));
3420
3421 let mounts = builder.build_storage_mounts(&spec, &volume_paths).unwrap();
3422
3423 assert_eq!(mounts.len(), 1);
3424 assert_eq!(mounts[0].destination().to_string_lossy(), "/app/data");
3425 assert_eq!(
3426 mounts[0]
3427 .source()
3428 .as_ref()
3429 .map(|s| s.to_string_lossy().to_string()),
3430 Some(
3431 dirs.volumes()
3432 .join("my-volume")
3433 .to_string_lossy()
3434 .into_owned()
3435 )
3436 );
3437 }
3438
3439 #[test]
3440 fn test_build_storage_mounts_tmpfs() {
3441 let spec = serde_yaml::from_str::<zlayer_spec::DeploymentSpec>(
3442 r"
3443version: v1
3444deployment: test
3445services:
3446 test:
3447 image:
3448 name: test:latest
3449 storage:
3450 - type: tmpfs
3451 target: /app/tmp
3452 size: 256Mi
3453 mode: 1777
3454",
3455 )
3456 .unwrap()
3457 .services
3458 .remove("test")
3459 .unwrap();
3460
3461 let builder = BundleBuilder::new("/tmp/test-bundle".into());
3462 let volume_paths = std::collections::HashMap::new();
3463
3464 let mounts = builder.build_storage_mounts(&spec, &volume_paths).unwrap();
3465
3466 assert_eq!(mounts.len(), 1);
3467 assert_eq!(mounts[0].destination().to_string_lossy(), "/app/tmp");
3468 assert_eq!(mounts[0].typ().as_ref().map(String::as_str), Some("tmpfs"));
3469 let options = mounts[0].options().as_ref().unwrap();
3470 assert!(options.iter().any(|o| o.starts_with("size=")));
3471 assert!(options.iter().any(|o| o.starts_with("mode=")));
3472 }
3473
3474 #[test]
3475 fn test_build_storage_mounts_multiple() {
3476 let spec = serde_yaml::from_str::<zlayer_spec::DeploymentSpec>(
3477 r"
3478version: v1
3479deployment: test
3480services:
3481 test:
3482 image:
3483 name: test:latest
3484 storage:
3485 - type: bind
3486 source: /etc/config
3487 target: /app/config
3488 readonly: true
3489 - type: named
3490 name: app-data
3491 target: /app/data
3492 - type: tmpfs
3493 target: /app/tmp
3494",
3495 )
3496 .unwrap()
3497 .services
3498 .remove("test")
3499 .unwrap();
3500
3501 let dirs = zlayer_paths::ZLayerDirs::system_default();
3502 let builder = BundleBuilder::new("/tmp/test-bundle".into());
3503 let mut volume_paths = std::collections::HashMap::new();
3504 volume_paths.insert("app-data".to_string(), dirs.volumes().join("app-data"));
3505
3506 let mounts = builder.build_storage_mounts(&spec, &volume_paths).unwrap();
3507
3508 assert_eq!(mounts.len(), 3);
3509
3510 let destinations: Vec<String> = mounts
3512 .iter()
3513 .map(|m| m.destination().to_string_lossy().to_string())
3514 .collect();
3515 assert!(destinations.contains(&"/app/config".to_string()));
3516 assert!(destinations.contains(&"/app/data".to_string()));
3517 assert!(destinations.contains(&"/app/tmp".to_string()));
3518 }
3519
3520 #[test]
3521 fn test_build_storage_mounts_anonymous_missing_path() {
3522 let spec = serde_yaml::from_str::<zlayer_spec::DeploymentSpec>(
3523 r"
3524version: v1
3525deployment: test
3526services:
3527 test:
3528 image:
3529 name: test:latest
3530 storage:
3531 - type: anonymous
3532 target: /app/cache
3533",
3534 )
3535 .unwrap()
3536 .services
3537 .remove("test")
3538 .unwrap();
3539
3540 let builder = BundleBuilder::new("/tmp/test-bundle".into());
3541 let volume_paths = std::collections::HashMap::new(); let result = builder.build_storage_mounts(&spec, &volume_paths);
3544
3545 assert!(result.is_err());
3547 }
3548
3549 #[cfg(target_os = "linux")]
3550 #[tokio::test]
3551 async fn test_oci_spec_includes_storage_mounts() {
3552 let id = ContainerId::new("test".to_string(), 1);
3553 let spec = serde_yaml::from_str::<zlayer_spec::DeploymentSpec>(
3554 r"
3555version: v1
3556deployment: test
3557services:
3558 test:
3559 image:
3560 name: test:latest
3561 storage:
3562 - type: bind
3563 source: /host/data
3564 target: /app/data
3565 - type: tmpfs
3566 target: /app/tmp
3567",
3568 )
3569 .unwrap()
3570 .services
3571 .remove("test")
3572 .unwrap();
3573
3574 let builder = BundleBuilder::new("/tmp/test-bundle".into());
3575 let volume_paths = std::collections::HashMap::new();
3576
3577 let oci_spec = builder
3578 .build_spec_only(&id, &spec, &volume_paths)
3579 .await
3580 .unwrap();
3581
3582 let mounts = oci_spec.mounts().as_ref().unwrap();
3584 let destinations: Vec<String> = mounts
3585 .iter()
3586 .map(|m| m.destination().to_string_lossy().to_string())
3587 .collect();
3588
3589 assert!(destinations.contains(&"/proc".to_string())); assert!(destinations.contains(&"/dev".to_string())); assert!(destinations.contains(&"/app/data".to_string())); assert!(destinations.contains(&"/app/tmp".to_string())); }
3595
3596 fn mock_gpu_spec(vendor: &str, count: u32) -> ServiceSpec {
3597 let yaml = format!(
3598 "
3599version: v1
3600deployment: test
3601services:
3602 test:
3603 rtype: service
3604 image:
3605 name: test:latest
3606 resources:
3607 gpu:
3608 count: {count}
3609 vendor: {vendor}
3610 endpoints:
3611 - name: http
3612 protocol: http
3613 port: 8080
3614"
3615 );
3616 serde_yaml::from_str::<DeploymentSpec>(&yaml)
3617 .unwrap()
3618 .services
3619 .remove("test")
3620 .unwrap()
3621 }
3622
3623 fn write_nvidia_cdi_fixture(dir: &std::path::Path, json: &str) {
3624 std::fs::write(dir.join("nvidia.json"), json).unwrap();
3625 }
3626
3627 fn nvidia_cdi_fixture() -> &'static str {
3628 r#"{
3629 "cdiVersion": "0.6.0",
3630 "kind": "nvidia.com/gpu",
3631 "devices": [{
3632 "name": "0",
3633 "containerEdits": {
3634 "deviceNodes": [
3635 {"path": "/dev/nvidia0", "type": "c", "major": 195, "minor": 0}
3636 ],
3637 "env": ["NVIDIA_VISIBLE_DEVICES=0"],
3638 "hooks": {
3639 "createContainer": [{
3640 "path": "/usr/bin/nvidia-container-runtime-hook",
3641 "args": ["nvidia-container-runtime-hook", "prestart"]
3642 }]
3643 }
3644 }
3645 }]
3646 }"#
3647 }
3648
3649 #[cfg(target_os = "linux")]
3650 #[tokio::test]
3651 async fn gpu_spec_translates_to_cdi_device_nodes() {
3652 let dir = tempfile::tempdir().unwrap();
3653 write_nvidia_cdi_fixture(dir.path(), nvidia_cdi_fixture());
3654 let registry = std::sync::Arc::new(crate::cdi::CdiRegistry::discover_from(&[dir.path()]));
3655
3656 let id = ContainerId::new("test".to_string(), 1);
3657 let spec = mock_gpu_spec("nvidia", 1);
3658 let builder = BundleBuilder::new("/tmp/test-bundle-cdi".into()).with_cdi_registry(registry);
3659
3660 let oci_spec = builder
3661 .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3662 .await
3663 .expect("build with CDI fixture");
3664
3665 let linux = oci_spec.linux().as_ref().expect("linux config present");
3667 let devices = linux.devices().as_ref().expect("devices present");
3668 assert!(
3669 devices
3670 .iter()
3671 .any(|d| d.path() == std::path::Path::new("/dev/nvidia0")),
3672 "expected /dev/nvidia0 from CDI fixture; got {:?}",
3673 devices
3674 .iter()
3675 .map(oci_spec::runtime::LinuxDevice::path)
3676 .collect::<Vec<_>>()
3677 );
3678
3679 let process = oci_spec.process().as_ref().expect("process present");
3681 let env = process.env().as_ref().expect("env present");
3682 assert!(
3683 env.iter().any(|e| e == "NVIDIA_VISIBLE_DEVICES=0"),
3684 "expected NVIDIA_VISIBLE_DEVICES=0 in env; got {env:?}"
3685 );
3686
3687 let hooks = oci_spec.hooks().as_ref().expect("hooks present");
3689 let create_container = hooks
3690 .create_container()
3691 .as_ref()
3692 .expect("createContainer hooks present");
3693 assert_eq!(create_container.len(), 1);
3694 assert_eq!(
3695 create_container[0].path(),
3696 &std::path::PathBuf::from("/usr/bin/nvidia-container-runtime-hook")
3697 );
3698 }
3699
3700 #[tokio::test]
3701 async fn gpu_spec_with_missing_cdi_returns_error() {
3702 let dir = tempfile::tempdir().unwrap();
3704 let registry = std::sync::Arc::new(crate::cdi::CdiRegistry::discover_from(&[dir.path()]));
3705
3706 let id = ContainerId::new("test".to_string(), 1);
3707 let spec = mock_gpu_spec("nvidia", 1);
3708 let builder =
3709 BundleBuilder::new("/tmp/test-bundle-cdi-missing".into()).with_cdi_registry(registry);
3710
3711 let err = builder
3712 .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3713 .await
3714 .expect_err("should fail when CDI registry is empty");
3715
3716 match err {
3717 AgentError::InvalidSpec(msg) => {
3718 assert!(
3719 msg.contains("nvidia") || msg.contains("CDI"),
3720 "error should mention CDI / vendor; got: {msg}"
3721 );
3722 }
3723 other => panic!("expected InvalidSpec, got {other:?}"),
3724 }
3725 }
3726
3727 #[tokio::test]
3728 async fn gpu_spec_with_unknown_device_returns_error() {
3729 let dir = tempfile::tempdir().unwrap();
3732 write_nvidia_cdi_fixture(dir.path(), nvidia_cdi_fixture());
3733 let registry = std::sync::Arc::new(crate::cdi::CdiRegistry::discover_from(&[dir.path()]));
3734
3735 let id = ContainerId::new("test".to_string(), 1);
3736 let spec = mock_gpu_spec("nvidia", 2);
3737 let builder =
3738 BundleBuilder::new("/tmp/test-bundle-cdi-unknown".into()).with_cdi_registry(registry);
3739
3740 let err = builder
3741 .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3742 .await
3743 .expect_err("should fail when device '1' is not declared");
3744 match err {
3745 AgentError::InvalidSpec(msg) => {
3746 assert!(
3747 msg.contains("'1'") || msg.contains("device"),
3748 "error should mention the missing device; got: {msg}"
3749 );
3750 }
3751 other => panic!("expected InvalidSpec, got {other:?}"),
3752 }
3753 }
3754
3755 #[cfg(target_os = "linux")]
3756 #[tokio::test]
3757 async fn gpu_spec_with_all_devices_expands_to_all_in_spec() {
3758 let dir = tempfile::tempdir().unwrap();
3760 let fixture = r#"{
3761 "cdiVersion": "0.6.0",
3762 "kind": "nvidia.com/gpu",
3763 "devices": [
3764 {
3765 "name": "0",
3766 "containerEdits": {
3767 "env": ["NVIDIA_VISIBLE_DEVICES=0"],
3768 "deviceNodes": [
3769 {"path": "/dev/nvidia0", "type": "c", "major": 195, "minor": 0}
3770 ]
3771 }
3772 },
3773 {
3774 "name": "1",
3775 "containerEdits": {
3776 "env": ["NVIDIA_VISIBLE_DEVICES=1"],
3777 "deviceNodes": [
3778 {"path": "/dev/nvidia1", "type": "c", "major": 195, "minor": 1}
3779 ]
3780 }
3781 }
3782 ]
3783 }"#;
3784 write_nvidia_cdi_fixture(dir.path(), fixture);
3785 let registry = std::sync::Arc::new(crate::cdi::CdiRegistry::discover_from(&[dir.path()]));
3786
3787 let edits = registry
3790 .resolve_for_kind("nvidia.com/gpu", &["all".to_string()])
3791 .expect("resolve all");
3792 assert_eq!(edits.len(), 2);
3793
3794 let id = ContainerId::new("test".to_string(), 1);
3797 let spec = mock_gpu_spec("nvidia", 2);
3798 let builder =
3799 BundleBuilder::new("/tmp/test-bundle-cdi-all".into()).with_cdi_registry(registry);
3800
3801 let oci_spec = builder
3802 .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3803 .await
3804 .expect("build with 2-device fixture");
3805
3806 let devices = oci_spec
3807 .linux()
3808 .as_ref()
3809 .unwrap()
3810 .devices()
3811 .as_ref()
3812 .expect("devices present");
3813 let paths: Vec<_> = devices.iter().map(|d| d.path().clone()).collect();
3814 assert!(paths.contains(&std::path::PathBuf::from("/dev/nvidia0")));
3815 assert!(paths.contains(&std::path::PathBuf::from("/dev/nvidia1")));
3816 }
3817
3818 fn build_nvidia_cdi_registry(dir: &std::path::Path) -> std::sync::Arc<crate::cdi::CdiRegistry> {
3823 write_nvidia_cdi_fixture(dir, nvidia_cdi_fixture());
3824 std::sync::Arc::new(crate::cdi::CdiRegistry::discover_from(&[dir]))
3825 }
3826
3827 #[cfg(target_os = "linux")]
3828 #[tokio::test]
3829 async fn gpu_spec_with_mps_sharing_injects_env_and_mounts() {
3830 let cdi_dir = tempfile::tempdir().unwrap();
3834 let mps_root = tempfile::tempdir().unwrap();
3835 let pipe_dir = mps_root.path().join("nvidia-mps");
3836 let log_dir = mps_root.path().join("nvidia-log");
3837 std::fs::create_dir(&pipe_dir).unwrap();
3838 std::fs::create_dir(&log_dir).unwrap();
3839 let registry = build_nvidia_cdi_registry(cdi_dir.path());
3840
3841 let id = ContainerId::new("test".to_string(), 1);
3842 let mut spec = mock_gpu_spec("nvidia", 1);
3843 let gpu = spec.resources.gpu.as_mut().expect("gpu spec set");
3844 gpu.sharing = Some(zlayer_spec::GpuSharingMode::Mps);
3845 gpu.mps_pipe_dir = Some(pipe_dir.to_string_lossy().into_owned());
3846 gpu.mps_log_dir = Some(log_dir.to_string_lossy().into_owned());
3847
3848 let builder =
3849 BundleBuilder::new("/tmp/test-bundle-mps-env".into()).with_cdi_registry(registry);
3850 let oci_spec = builder
3851 .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3852 .await
3853 .expect("build with MPS sharing");
3854
3855 let env = oci_spec
3856 .process()
3857 .as_ref()
3858 .and_then(|p| p.env().as_ref())
3859 .expect("env present");
3860 let pipe_expect = format!("CUDA_MPS_PIPE_DIRECTORY={}", pipe_dir.display());
3861 let log_expect = format!("CUDA_MPS_LOG_DIRECTORY={}", log_dir.display());
3862 assert!(
3863 env.iter().any(|e| e == &pipe_expect),
3864 "expected {pipe_expect} in env; got {env:?}"
3865 );
3866 assert!(
3867 env.iter().any(|e| e == &log_expect),
3868 "expected {log_expect} in env; got {env:?}"
3869 );
3870
3871 let mounts = oci_spec.mounts().as_ref().expect("mounts present");
3872 assert!(
3873 mounts
3874 .iter()
3875 .any(|m| m.destination() == &pipe_dir && m.source().as_ref() == Some(&pipe_dir)),
3876 "expected bind mount of MPS pipe dir {}; got destinations {:?}",
3877 pipe_dir.display(),
3878 mounts.iter().map(Mount::destination).collect::<Vec<_>>()
3879 );
3880 assert!(
3881 mounts
3882 .iter()
3883 .any(|m| m.destination() == &log_dir && m.source().as_ref() == Some(&log_dir)),
3884 "expected bind mount of MPS log dir {}",
3885 log_dir.display()
3886 );
3887 }
3888
3889 #[tokio::test]
3890 async fn gpu_spec_with_mps_sharing_fails_when_pipe_dir_missing() {
3891 let cdi_dir = tempfile::tempdir().unwrap();
3892 let registry = build_nvidia_cdi_registry(cdi_dir.path());
3893
3894 let id = ContainerId::new("test".to_string(), 1);
3895 let mut spec = mock_gpu_spec("nvidia", 1);
3896 let gpu = spec.resources.gpu.as_mut().expect("gpu spec set");
3897 gpu.sharing = Some(zlayer_spec::GpuSharingMode::Mps);
3898 let missing = tempfile::tempdir().unwrap();
3901 let missing_path = missing.path().join("definitely-not-here");
3902 gpu.mps_pipe_dir = Some(missing_path.to_string_lossy().into_owned());
3903
3904 let builder =
3905 BundleBuilder::new("/tmp/test-bundle-mps-missing".into()).with_cdi_registry(registry);
3906 let err = builder
3907 .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3908 .await
3909 .expect_err("should fail when MPS pipe dir is missing");
3910 match err {
3911 AgentError::GpuSharingUnavailable { mode, reason } => {
3912 assert_eq!(mode, "mps");
3913 assert!(
3914 reason.contains("pipe") || reason.contains(&missing_path.display().to_string()),
3915 "reason should mention the missing path; got: {reason}"
3916 );
3917 }
3918 other => panic!("expected GpuSharingUnavailable, got {other:?}"),
3919 }
3920 }
3921
3922 #[cfg(target_os = "linux")]
3923 #[tokio::test]
3924 async fn gpu_spec_with_timeslicing_injects_visible_devices() {
3925 let cdi_dir = tempfile::tempdir().unwrap();
3926 let registry = build_nvidia_cdi_registry(cdi_dir.path());
3927
3928 let id = ContainerId::new("test".to_string(), 1);
3929 let mut spec = mock_gpu_spec("nvidia", 1);
3930 let gpu = spec.resources.gpu.as_mut().expect("gpu spec set");
3931 gpu.sharing = Some(zlayer_spec::GpuSharingMode::TimeSlice);
3932 gpu.time_slice_index = Some(2);
3933
3934 let builder =
3935 BundleBuilder::new("/tmp/test-bundle-timeslice".into()).with_cdi_registry(registry);
3936 let oci_spec = builder
3937 .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3938 .await
3939 .expect("build with time-slicing");
3940
3941 let env = oci_spec
3942 .process()
3943 .as_ref()
3944 .and_then(|p| p.env().as_ref())
3945 .expect("env present");
3946 let cuda_entries: Vec<&String> = env
3949 .iter()
3950 .filter(|e| e.starts_with("CUDA_VISIBLE_DEVICES="))
3951 .collect();
3952 assert_eq!(
3953 cuda_entries.len(),
3954 1,
3955 "exactly one CUDA_VISIBLE_DEVICES expected; got {cuda_entries:?}"
3956 );
3957 assert_eq!(cuda_entries[0], "CUDA_VISIBLE_DEVICES=2");
3958 }
3959
3960 #[cfg(target_os = "linux")]
3961 #[tokio::test]
3962 async fn gpu_spec_without_sharing_omits_mps_env() {
3963 let cdi_dir = tempfile::tempdir().unwrap();
3964 let registry = build_nvidia_cdi_registry(cdi_dir.path());
3965
3966 let id = ContainerId::new("test".to_string(), 1);
3967 let spec = mock_gpu_spec("nvidia", 1);
3968 assert!(spec.resources.gpu.as_ref().unwrap().sharing.is_none());
3969
3970 let builder =
3971 BundleBuilder::new("/tmp/test-bundle-no-sharing".into()).with_cdi_registry(registry);
3972 let oci_spec = builder
3973 .build_oci_spec(&id, &spec, &std::collections::HashMap::new())
3974 .await
3975 .expect("build without sharing");
3976
3977 let env = oci_spec
3978 .process()
3979 .as_ref()
3980 .and_then(|p| p.env().as_ref())
3981 .expect("env present");
3982 assert!(
3983 !env.iter().any(|e| e.starts_with("CUDA_MPS_")),
3984 "no CUDA_MPS_* env should be present without sharing; got {env:?}"
3985 );
3986
3987 let mounts = oci_spec.mounts().as_ref().expect("mounts present");
3991 assert!(
3992 !mounts
3993 .iter()
3994 .any(|m| { m.destination().to_string_lossy().contains("nvidia-mps") }),
3995 "no MPS pipe mount should be present without sharing"
3996 );
3997 }
3998
3999 #[cfg(unix)]
4000 mod subid_tests {
4001 use super::super::read_subid_range;
4002 use std::io::Write;
4003
4004 #[test]
4005 fn read_subid_range_returns_range_for_user() {
4006 let mut tmp = tempfile::NamedTempFile::new().unwrap();
4007 writeln!(tmp, "alice:100000:65536").unwrap();
4008 writeln!(tmp, "bob:165536:65536").unwrap();
4009 tmp.flush().unwrap();
4010 let path = tmp.path().to_str().unwrap();
4011 assert_eq!(read_subid_range(path, "bob"), Some((165_536, 65_536)));
4012 assert_eq!(read_subid_range(path, "alice"), Some((100_000, 65_536)));
4013 }
4014
4015 #[test]
4016 fn read_subid_range_returns_none_for_unknown_user() {
4017 let mut tmp = tempfile::NamedTempFile::new().unwrap();
4018 writeln!(tmp, "alice:100000:65536").unwrap();
4019 tmp.flush().unwrap();
4020 assert_eq!(
4021 read_subid_range(tmp.path().to_str().unwrap(), "carol"),
4022 None
4023 );
4024 }
4025
4026 #[test]
4027 fn read_subid_range_returns_none_on_missing_file() {
4028 assert_eq!(
4029 read_subid_range("/this/path/does/not/exist/subuid", "anyone"),
4030 None
4031 );
4032 }
4033 }
4034}