Skip to main content

zlayer_agent/
lib.rs

1//! `ZLayer` Agent - Container Runtime
2//!
3//! Manages container lifecycle, health checking, init actions, and proxy integration.
4
5pub mod auth;
6pub mod autoscale_controller;
7pub mod bundle;
8pub mod capability;
9pub mod cdi;
10pub mod cgroups_stats;
11pub mod container_supervisor;
12pub mod cron_scheduler;
13pub mod dependency;
14pub mod env;
15pub mod error;
16pub mod gpu_detector;
17pub mod gpu_metrics;
18pub mod gpu_sharing;
19pub mod health;
20pub mod init;
21pub mod job;
22pub mod kv;
23pub mod metrics_providers;
24pub mod overlay_manager;
25pub mod proxy_manager;
26pub mod runtime;
27pub mod runtimes;
28pub mod service;
29pub mod stabilization;
30pub mod storage_manager;
31pub mod worker_client;
32
33#[cfg(target_os = "windows")]
34pub mod windows;
35
36pub use autoscale_controller::{has_adaptive_scaling, AutoscaleController};
37pub use bundle::*;
38pub use container_supervisor::{
39    ContainerSupervisor, SupervisedContainer, SupervisedState, SupervisorConfig, SupervisorEvent,
40};
41pub use cron_scheduler::{CronJobInfo, CronScheduler};
42pub use dependency::{
43    DependencyConditionChecker, DependencyError, DependencyGraph, DependencyNode, DependencyWaiter,
44    WaitResult,
45};
46pub use env::{
47    resolve_env_value, resolve_env_vars, resolve_env_with_secrets, EnvResolutionError, ResolvedEnv,
48};
49pub use error::*;
50pub use gpu_detector::{detect_gpus, GpuInfo};
51pub use health::*;
52pub use init::{BackoffConfig, InitOrchestrator};
53pub use job::{
54    JobExecution, JobExecutionId, JobExecutor, JobExecutorConfig, JobStatus, JobTrigger,
55};
56pub use kv::{
57    global_kv, set_global_kv, KvBackend, KvEntry, KvError, KvEvent, KvEventKind, KvStore,
58};
59pub use metrics_providers::{RuntimeStatsProvider, ServiceManagerContainerProvider};
60pub use overlay_manager::{make_interface_name, OverlayManager};
61pub use proxy_manager::{ProxyManager, ProxyManagerConfig};
62pub use runtime::*;
63pub use runtimes::{create_runtime_for_image, detect_image_artifact_type};
64
65// Youki runtime types are only available on Linux with the `youki-runtime` feature.
66#[cfg(all(target_os = "linux", feature = "youki-runtime"))]
67pub use runtimes::{YoukiConfig, YoukiRuntime};
68
69#[cfg(feature = "docker")]
70pub use runtimes::DockerRuntime;
71
72#[cfg(feature = "wasm")]
73pub use runtimes::{WasmConfig, WasmRuntime};
74
75#[cfg(target_os = "macos")]
76pub use runtimes::macos_sandbox::SandboxRuntime;
77#[cfg(target_os = "macos")]
78pub use runtimes::macos_vm::VmRuntime;
79
80pub use service::*;
81pub use stabilization::{
82    wait_for_stabilization, ServiceHealthSummary, StabilizationOutcome, StabilizationResult,
83};
84pub use storage_manager::{StorageError, StorageManager, VolumeInfo};
85pub use worker_client::{
86    WorkerClientError, WorkerClientImpl, WorkerIdentity, WorkerStatusProvider,
87};
88
89#[cfg(target_os = "macos")]
90use std::path::PathBuf;
91use std::sync::Arc;
92
/// Configuration for the macOS sandbox-based container runtime.
///
/// Uses Apple's sandbox framework (`sandbox_init`/`sandbox-exec`) to provide
/// process isolation on macOS. This is the preferred runtime on macOS
/// when Docker is not available or not desired.
///
/// The [`Default`] implementation takes both directories from
/// `zlayer_paths::ZLayerDirs::system_default()` and enables GPU access.
#[cfg(target_os = "macos")]
#[derive(Debug, Clone)]
pub struct MacSandboxConfig {
    /// Directory for container data and rootfs
    pub data_dir: PathBuf,
    /// Directory for container logs
    pub log_dir: PathBuf,
    /// Whether to enable GPU access (Metal/MPS) for containers
    pub gpu_access: bool,
}
108
#[cfg(target_os = "macos")]
impl Default for MacSandboxConfig {
    /// Builds a configuration rooted at the system-default `ZLayer` directories,
    /// with GPU access switched on.
    fn default() -> Self {
        let system_dirs = zlayer_paths::ZLayerDirs::system_default();
        let data_dir = system_dirs.data_dir().to_path_buf();
        let log_dir = system_dirs.logs();

        Self {
            data_dir,
            log_dir,
            gpu_access: true,
        }
    }
}
120
/// Configuration for selecting and configuring a container runtime.
///
/// Passed to [`create_runtime`], which turns the selected variant into a
/// concrete runtime instance. Most variants are gated on a target OS and/or a
/// cargo feature, so the set of available variants depends on the build.
#[derive(Debug, Clone, Default)]
pub enum RuntimeConfig {
    /// Automatically select the best available runtime
    ///
    /// Selection logic:
    /// - On Linux: Uses bundled libcontainer runtime (no external binary needed), falls back to Docker
    /// - On macOS: Uses sandbox runtime (with VM/VZ delegates when available), falls back to the
    ///   VM runtime (libkrun) on its own, then Docker
    /// - On Windows: Uses the native HCS runtime (with an optional WSL2 delegate when the `wsl`
    ///   feature is enabled), falls back to Docker
    /// - If no runtime can be initialized, returns an error
    #[default]
    Auto,
    /// Use the mock runtime for testing and development
    Mock,
    /// Use youki/libcontainer as the container runtime (Linux only, requires the `youki-runtime` feature)
    #[cfg(all(target_os = "linux", feature = "youki-runtime"))]
    Youki(YoukiConfig),
    /// Use Docker daemon as the container runtime (cross-platform, requires the `docker` feature)
    #[cfg(feature = "docker")]
    Docker,
    /// Use WebAssembly runtime with wasmtime for WASM workloads (requires the `wasm` feature)
    #[cfg(feature = "wasm")]
    Wasm(WasmConfig),
    /// Use macOS sandbox-based container runtime
    #[cfg(target_os = "macos")]
    MacSandbox(MacSandboxConfig),
    /// Use macOS libkrun micro-VMs for Linux-guest isolation.
    #[cfg(target_os = "macos")]
    MacVm,
    /// Use Apple `Virtualization.framework` for ephemeral native-macOS guest
    /// VMs. Opt-in only (never `Auto`); route via `com.zlayer.isolation=vz`.
    #[cfg(target_os = "macos")]
    MacVz,
    /// WSL2 backend (deprecated).
    ///
    /// Preserved for one release for back-compat with existing `runtime: wsl2`
    /// configs. No real WSL2-specific backend was ever shipped — this variant
    /// was a stub that suggested using Docker Desktop with the WSL2 backend.
    /// [`create_runtime`] now dispatches it as [`RuntimeConfig::Hcs`] with a
    /// default `HcsConfig`.
    #[cfg(target_os = "windows")]
    #[deprecated(
        note = "Wsl2 is deprecated in favor of Hcs (native Windows containers via the \
                Host Compute Service). This variant is preserved for one release and \
                currently aliases to Hcs with a default config at dispatch time."
    )]
    Wsl2,
    /// Native Windows container runtime via the Host Compute Service (HCS).
    ///
    /// Windows-only. Drives containers directly against the Windows HCS API
    /// (see [`crate::runtimes::hcs`]) without requiring Docker Desktop or a
    /// WSL2 VM.
    #[cfg(target_os = "windows")]
    Hcs(crate::runtimes::hcs::HcsConfig),
}
174
/// Report whether a Docker daemon can be reached and answers a ping.
///
/// Connects with the platform's local defaults and then pings the daemon.
/// Either step failing is logged at debug level and reported as "not
/// available" rather than surfaced as an error.
///
/// # Returns
/// `true` if Docker is available, `false` otherwise
#[cfg(feature = "docker")]
pub async fn is_docker_available() -> bool {
    // Step 1: obtain a client handle; bail out early if that already fails.
    let client = match bollard::Docker::connect_with_local_defaults() {
        Ok(client) => client,
        Err(e) => {
            tracing::debug!(error = %e, "Failed to connect to Docker daemon");
            return false;
        }
    };

    // Step 2: a handle alone proves nothing — the daemon must answer a ping.
    if let Err(e) = client.ping().await {
        tracing::debug!(error = %e, "Docker daemon ping failed");
        return false;
    }

    tracing::debug!("Docker daemon is available");
    true
}

/// Report whether a Docker daemon is available (stub used when the `docker`
/// feature is disabled; always `false`).
#[cfg(not(feature = "docker"))]
// Stays `async` so the signature matches the docker-enabled variant.
#[allow(clippy::unused_async)]
pub async fn is_docker_available() -> bool {
    false
}
210
/// Check if the WASM runtime is available (compiled in)
///
/// Returns `true` when this binary was built with the `wasm` feature (and
/// therefore has the wasmtime runtime compiled in), `false` otherwise. The
/// answer is fixed at compile time.
///
/// # Example
///
/// ```
/// use zlayer_agent::is_wasm_available;
///
/// if is_wasm_available() {
///     println!("WASM runtime is available");
/// } else {
///     println!("WASM runtime is not compiled in");
/// }
/// ```
#[must_use]
pub fn is_wasm_available() -> bool {
    cfg!(feature = "wasm")
}
239
/// Create a runtime based on the provided configuration
///
/// # Arguments
/// * `config` - The runtime configuration specifying which runtime to use
/// * `auth_ctx` - Optional container auth context, forwarded to every runtime
///   constructor in this function that accepts one
///
/// # Returns
/// An `Arc<dyn Runtime + Send + Sync>` that can be used with `ServiceManager`
///
/// # Errors
/// Returns `AgentError` if the runtime cannot be initialized (e.g., failed to create
/// required directories, no runtime available for Auto mode)
///
/// # Runtime Selection for Auto Mode
///
/// When `RuntimeConfig::Auto` is specified:
/// - **Linux**: Uses bundled libcontainer runtime (no external binary needed), falls back to Docker
/// - **macOS**: Uses sandbox runtime (native Metal/MPS), falls back to VM runtime (libkrun), then Docker
/// - **Windows**: Uses the native HCS runtime (plus a WSL2 delegate when the `wsl` feature is
///   enabled and WSL2 is usable), falls back to Docker
/// - If no runtime can be initialized, returns an error
///
/// # Example
/// ```no_run
/// use zlayer_agent::{RuntimeConfig, create_runtime};
///
/// # async fn example() -> Result<(), zlayer_agent::AgentError> {
/// let runtime = create_runtime(RuntimeConfig::Auto, None).await?;
/// # Ok(())
/// # }
/// ```
#[allow(clippy::too_many_lines)]
pub async fn create_runtime(
    config: RuntimeConfig,
    auth_ctx: Option<ContainerAuthContext>,
) -> Result<Arc<dyn Runtime + Send + Sync>> {
    match config {
        // Auto mode delegates the whole platform probe to `create_auto_runtime`.
        RuntimeConfig::Auto => create_auto_runtime(auth_ctx).await,
        RuntimeConfig::Mock => Ok(Arc::new(MockRuntime::new())),
        #[cfg(all(target_os = "linux", feature = "youki-runtime"))]
        RuntimeConfig::Youki(youki_config) => {
            let runtime = YoukiRuntime::new(youki_config, auth_ctx).await?;
            Ok(Arc::new(runtime))
        }
        #[cfg(feature = "docker")]
        RuntimeConfig::Docker => {
            let runtime = DockerRuntime::new(auth_ctx).await?;
            Ok(Arc::new(runtime))
        }
        #[cfg(feature = "wasm")]
        RuntimeConfig::Wasm(wasm_config) => {
            let runtime = WasmRuntime::new(wasm_config, auth_ctx).await?;
            Ok(Arc::new(runtime))
        }
        #[cfg(target_os = "macos")]
        RuntimeConfig::MacSandbox(config) => {
            // The sandbox runtime is mandatory for this variant: a construction
            // failure is propagated with `?`. Every delegate below is optional.
            let primary: Arc<dyn Runtime> = Arc::new(runtimes::macos_sandbox::SandboxRuntime::new(
                config,
                auth_ctx.clone(),
            )?);
            // Optional libkrun VM delegate; failure is logged and tolerated.
            let delegate: Option<Arc<dyn Runtime>> = match runtimes::macos_vm::VmRuntime::new(
                auth_ctx.clone(),
            ) {
                Ok(rt) => {
                    tracing::info!(
                            "macOS VM (libkrun) delegate available — Linux containers will execute in a micro-VM"
                        );
                    Some(Arc::new(rt))
                }
                Err(e) => {
                    tracing::warn!(
                        error = %e,
                        "macOS VM delegate unavailable; node will only run mac-native containers"
                    );
                    None
                }
            };
            // VZ Linux-guest delegate (the default Linux path on macOS). First
            // party (no dylib), so this normally succeeds.
            // NOTE(review): a failure here is discarded silently by `.ok()`,
            // unlike the VM and VZ delegates around it, which log a warning.
            let vz_linux: Option<Arc<dyn Runtime>> =
                runtimes::macos_vz_linux::VzLinuxRuntime::new(auth_ctx.clone())
                    .map(|rt| Arc::new(rt) as Arc<dyn Runtime>)
                    .ok();
            // Opt-in VZ delegate (native-macOS guests via `com.zlayer.isolation=vz`).
            // Last consumer of `auth_ctx` in this arm, so it is moved rather than cloned.
            let vz: Option<Arc<dyn Runtime>> = match runtimes::macos_vz::VzRuntime::new(auth_ctx) {
                Ok(rt) => Some(Arc::new(rt)),
                Err(e) => {
                    tracing::warn!(error = %e, "macOS VZ delegate unavailable");
                    None
                }
            };
            // Point image-OS inspection at BOTH persistent blob caches the
            // composite's `pull_image` writes into, tried in order:
            //   1. the VZ-Linux runtime's `{data_dir}/vz/linux/images/blobs.redb`
            //      (the delegate that actually runs the Linux workload), and
            //   2. the primary Sandbox runtime's `{data_dir}/images/blobs.redb`.
            // `pull_image` pulls into BOTH (primary first, then VZ-Linux), and
            // either pull short-circuits under `IfNotPresent` when its rootfs
            // already exists — so an already-pulled image's manifest+config may
            // live in only ONE of the two stores. Probing both (local-only, no
            // network per cache) lets the composite resolve a locally-cached
            // Linux image's OS with NO network call — so the workload still
            // routes to VZ-Linux even when Docker Hub is rate-limiting the
            // redundant OS re-inspection.
            // NOTE(review): the paths are derived from the default data dir,
            // not from the `data_dir` of the `MacSandboxConfig` passed in —
            // confirm they agree when a custom config is used.
            let data_dir = zlayer_paths::ZLayerDirs::default_data_dir();
            let os_inspect_cache_paths = vec![
                data_dir
                    .join("vz")
                    .join("linux")
                    .join("images")
                    .join("blobs.redb"),
                data_dir.join("images").join("blobs.redb"),
            ];
            Ok(Arc::new(
                runtimes::composite::CompositeRuntime::new(primary, delegate)
                    .with_vz_delegate(vz)
                    .with_vz_linux_delegate(vz_linux)
                    .with_os_inspect_cache_paths(os_inspect_cache_paths),
            ))
        }
        #[cfg(target_os = "macos")]
        RuntimeConfig::MacVm => Ok(Arc::new(runtimes::macos_vm::VmRuntime::new(auth_ctx)?)),
        #[cfg(target_os = "macos")]
        RuntimeConfig::MacVz => Ok(Arc::new(runtimes::macos_vz::VzRuntime::new(auth_ctx)?)),
        #[cfg(target_os = "windows")]
        #[allow(deprecated)]
        RuntimeConfig::Wsl2 => {
            tracing::warn!(
                "RuntimeConfig::Wsl2 is deprecated; treating as RuntimeConfig::Hcs with default config"
            );
            // Recursive call into this same async fn, so the future is boxed.
            Box::pin(create_runtime(
                RuntimeConfig::Hcs(crate::runtimes::hcs::HcsConfig::default()),
                auth_ctx,
            ))
            .await
        }
        #[cfg(target_os = "windows")]
        RuntimeConfig::Hcs(hcs_config) => {
            // HCS is mandatory for this variant (`?` propagates failure); the
            // WSL2 delegate is optional.
            // NOTE(review): `auth_ctx` is not forwarded to the HCS runtime or
            // the WSL2 delegate in this arm — confirm that is intentional.
            let primary: Arc<dyn Runtime> =
                Arc::new(crate::runtimes::hcs::HcsRuntime::new(hcs_config).await?);

            // `try_new` has three outcomes: a delegate, "WSL2 not present"
            // (`Ok(None)`), or a setup error. Only the first yields a delegate.
            #[cfg(feature = "wsl")]
            let delegate: Option<Arc<dyn Runtime>> =
                match runtimes::wsl2_delegate::Wsl2DelegateRuntime::try_new().await {
                    Ok(Some(rt)) => {
                        tracing::info!(
                            "WSL2 delegate runtime available — Linux containers will execute inside the zlayer distro"
                        );
                        Some(Arc::new(rt))
                    }
                    Ok(None) => {
                        tracing::info!(
                            "WSL2 not available; node will only run Windows-image containers"
                        );
                        None
                    }
                    Err(e) => {
                        tracing::warn!(
                            error = %e,
                            "WSL2 delegate setup failed; node will only run Windows-image containers"
                        );
                        None
                    }
                };
            // Without the `wsl` feature there is never a delegate.
            #[cfg(not(feature = "wsl"))]
            let delegate: Option<Arc<dyn Runtime>> = None;

            Ok(Arc::new(runtimes::composite::CompositeRuntime::new(
                primary, delegate,
            )))
        }
    }
}
411
/// Automatically select and create the best available runtime
///
/// Selection logic:
/// - On Linux: Uses bundled libcontainer runtime directly (no external binary needed), falls back to Docker
/// - On macOS: `SandboxRuntime` (native Metal/MPS) → `VmRuntime` (libkrun Linux compat with GPU) → Docker
/// - On Windows: `HcsRuntime` (native Windows containers, with an optional WSL2 delegate behind
///   the `wsl` feature) → Docker
/// - Returns an error if no runtime can be initialized
///
/// `auth_ctx` is cloned into each runtime constructor that is attempted and
/// finally moved into the Docker runtime if that fallback is reached.
///
/// # Errors
/// Returns `AgentError::Configuration` when every candidate for the current
/// platform/feature set is unavailable, and propagates the error from
/// `DockerRuntime::new` if the daemon answered the probe but construction failed.
#[cfg_attr(
    not(all(target_os = "linux", feature = "youki-runtime")),
    allow(clippy::unused_async)
)]
#[cfg_attr(
    not(any(
        all(target_os = "linux", feature = "youki-runtime"),
        target_os = "macos",
        feature = "docker"
    )),
    allow(unused_variables)
)]
#[allow(clippy::too_many_lines)]
async fn create_auto_runtime(
    auth_ctx: Option<ContainerAuthContext>,
) -> Result<Arc<dyn Runtime + Send + Sync>> {
    tracing::info!("Auto-selecting container runtime");

    // On Linux, use bundled libcontainer runtime (no daemon overhead, no external binary needed)
    #[cfg(all(target_os = "linux", feature = "youki-runtime"))]
    {
        match YoukiRuntime::new(YoukiConfig::default(), auth_ctx.clone()).await {
            Ok(runtime) => {
                tracing::info!("Using bundled libcontainer runtime (Linux-native, no daemon)");
                return Ok(Arc::new(runtime));
            }
            Err(e) => {
                // Not fatal: fall through to the Docker probe further down.
                tracing::warn!(error = %e, "Failed to initialize libcontainer runtime, trying Docker");
            }
        }
    }

    // On macOS, build a composite runtime:
    //   primary  = SandboxRuntime (native Metal/MPS), when available
    //   delegate = VmRuntime (libkrun Linux compat), when available
    // If at least the primary is available, return the composite. Otherwise
    // (e.g. sandbox init failed), fall through to Docker.
    #[cfg(target_os = "macos")]
    {
        let primary: Option<Arc<dyn Runtime>> = match runtimes::macos_sandbox::SandboxRuntime::new(
            MacSandboxConfig::default(),
            auth_ctx.clone(),
        ) {
            Ok(rt) => Some(Arc::new(rt)),
            Err(e) => {
                tracing::warn!("macOS sandbox runtime unavailable: {e}");
                None
            }
        };
        let delegate: Option<Arc<dyn Runtime>> = match runtimes::macos_vm::VmRuntime::new(
            auth_ctx.clone(),
        ) {
            Ok(rt) => {
                tracing::info!(
                        "macOS VM (libkrun) delegate available — Linux containers will execute in a micro-VM"
                    );
                Some(Arc::new(rt))
            }
            Err(e) => {
                tracing::warn!("macOS VM runtime (libkrun) unavailable: {e}");
                None
            }
        };
        // Opt-in VZ delegate (native-macOS guests via `com.zlayer.isolation=vz`);
        // never the default, only used when a service requests it.
        // NOTE(review): construction errors for both VZ delegates are dropped
        // by `.ok()` without any log line, unlike the two runtimes above.
        let vz: Option<Arc<dyn Runtime>> = runtimes::macos_vz::VzRuntime::new(auth_ctx.clone())
            .map(|rt| Arc::new(rt) as Arc<dyn Runtime>)
            .ok();
        // VZ Linux-guest delegate — the default Linux path on macOS.
        let vz_linux: Option<Arc<dyn Runtime>> =
            runtimes::macos_vz_linux::VzLinuxRuntime::new(auth_ctx.clone())
                .map(|rt| Arc::new(rt) as Arc<dyn Runtime>)
                .ok();

        if let Some(p) = primary {
            // Point image-OS dispatch inspection at BOTH persistent blob caches
            // the composite's `pull_image` writes into (VZ-Linux first, then the
            // primary Sandbox store), so an already-pulled image's OS resolves
            // LOCAL-ONLY with no network round-trip — the cached Linux image
            // routes to VZ-Linux even when Docker Hub is rate-limiting. Mirrors
            // the `RuntimeConfig::MacSandbox` arm above.
            let data_dir = zlayer_paths::ZLayerDirs::default_data_dir();
            let os_inspect_cache_paths = vec![
                data_dir
                    .join("vz")
                    .join("linux")
                    .join("images")
                    .join("blobs.redb"),
                data_dir.join("images").join("blobs.redb"),
            ];
            return Ok(Arc::new(
                runtimes::composite::CompositeRuntime::new(p, delegate)
                    .with_vz_delegate(vz)
                    .with_vz_linux_delegate(vz_linux)
                    .with_os_inspect_cache_paths(os_inspect_cache_paths),
            ));
        }
        // If sandbox failed but VM succeeded, use the VM runtime on its own —
        // it's still the best available native macOS path before falling back
        // to Docker.
        // NOTE(review): on this path the `vz` and `vz_linux` delegates built
        // above are dropped unused, even if they initialized successfully.
        if let Some(d) = delegate {
            return Ok(d);
        }
    }

    // On Windows, build a composite runtime:
    //   primary  = HcsRuntime (native Windows containers), when available
    //   delegate = Wsl2DelegateRuntime (Linux containers via WSL2), when available
    // If the primary is available, return the composite. Otherwise fall
    // through to Docker.
    #[cfg(target_os = "windows")]
    {
        let primary: Option<Arc<dyn Runtime>> =
            match crate::runtimes::hcs::HcsRuntime::new(crate::runtimes::hcs::HcsConfig::default())
                .await
            {
                Ok(rt) => {
                    tracing::info!(
                        "Using native Windows HCS runtime (no Docker Desktop / WSL2 required)"
                    );
                    Some(Arc::new(rt))
                }
                Err(e) => {
                    tracing::warn!(error = %e, "HCS runtime unavailable, falling back to Docker");
                    None
                }
            };

        // The WSL2 delegate is probed regardless of whether HCS came up; it is
        // only used if `primary` is `Some` below.
        #[cfg(feature = "wsl")]
        let delegate: Option<Arc<dyn Runtime>> =
            match runtimes::wsl2_delegate::Wsl2DelegateRuntime::try_new().await {
                Ok(Some(rt)) => {
                    tracing::info!(
                        "WSL2 delegate runtime available — Linux containers will execute inside the zlayer distro"
                    );
                    Some(Arc::new(rt))
                }
                Ok(None) => {
                    tracing::info!(
                        "WSL2 not available; node will only run Windows-image containers"
                    );
                    None
                }
                Err(e) => {
                    tracing::warn!(
                        error = %e,
                        "WSL2 delegate setup failed; node will only run Windows-image containers"
                    );
                    None
                }
            };
        // Without the `wsl` feature there is never a delegate.
        #[cfg(not(feature = "wsl"))]
        let delegate: Option<Arc<dyn Runtime>> = None;

        if let Some(p) = primary {
            return Ok(Arc::new(runtimes::composite::CompositeRuntime::new(
                p, delegate,
            )));
        }
    }

    // Last resort on every platform: Docker, if the feature is compiled in and
    // the daemon answers the availability probe.
    #[cfg(feature = "docker")]
    {
        if is_docker_available().await {
            tracing::info!("Selected Docker runtime");
            let runtime = DockerRuntime::new(auth_ctx).await?;
            return Ok(Arc::new(runtime));
        }
        tracing::debug!("Docker daemon not available");
    }

    // No runtime available. The four cfg-gated blocks below are mutually
    // exclusive (Linux vs. non-Linux × `docker` on/off), so exactly one is
    // compiled in and serves as the function's tail expression.
    // NOTE(review): the Linux messages say libcontainer "failed to initialize"
    // even when the `youki-runtime` feature is not compiled in at all.
    #[cfg(all(target_os = "linux", feature = "docker"))]
    {
        Err(AgentError::Configuration(
            "Bundled libcontainer runtime failed to initialize and Docker daemon is not available."
                .to_string(),
        ))
    }

    #[cfg(all(target_os = "linux", not(feature = "docker")))]
    {
        Err(AgentError::Configuration(
            "Bundled libcontainer runtime failed to initialize. Enable the 'docker' feature for an alternative."
                .to_string(),
        ))
    }

    #[cfg(all(not(target_os = "linux"), feature = "docker"))]
    {
        Err(AgentError::Configuration(
            "No container runtime available. Start the Docker daemon.".to_string(),
        ))
    }

    #[cfg(all(not(target_os = "linux"), not(feature = "docker")))]
    {
        Err(AgentError::Configuration(
            "No container runtime available. Enable the 'docker' feature and start the Docker daemon.".to_string(),
        ))
    }
}