Skip to main content

zlayer_agent/
lib.rs

1//! `ZLayer` Agent - Container Runtime
2//!
3//! Manages container lifecycle, health checking, init actions, and proxy integration.
4
5pub mod auth;
6pub mod autoscale_controller;
7pub mod bundle;
8pub mod capability;
9pub mod cdi;
10pub mod cgroups_stats;
11pub mod container_supervisor;
12pub mod cron_scheduler;
13pub mod dependency;
14pub mod env;
15pub mod error;
16pub mod gpu_detector;
17pub mod gpu_metrics;
18pub mod gpu_sharing;
19pub mod health;
20pub mod init;
21pub mod job;
22pub mod metrics_providers;
23pub mod overlay_manager;
24pub mod proxy_manager;
25pub mod runtime;
26pub mod runtimes;
27pub mod service;
28pub mod stabilization;
29pub mod storage_manager;
30pub mod worker_client;
31
32#[cfg(target_os = "windows")]
33pub mod windows;
34
35pub use autoscale_controller::{has_adaptive_scaling, AutoscaleController};
36pub use bundle::*;
37pub use container_supervisor::{
38    ContainerSupervisor, SupervisedContainer, SupervisedState, SupervisorConfig, SupervisorEvent,
39};
40pub use cron_scheduler::{CronJobInfo, CronScheduler};
41pub use dependency::{
42    DependencyConditionChecker, DependencyError, DependencyGraph, DependencyNode, DependencyWaiter,
43    WaitResult,
44};
45pub use env::{
46    resolve_env_value, resolve_env_vars, resolve_env_with_secrets, EnvResolutionError, ResolvedEnv,
47};
48pub use error::*;
49pub use gpu_detector::{detect_gpus, GpuInfo};
50pub use health::*;
51pub use init::{BackoffConfig, InitOrchestrator};
52pub use job::{
53    JobExecution, JobExecutionId, JobExecutor, JobExecutorConfig, JobStatus, JobTrigger,
54};
55pub use metrics_providers::{RuntimeStatsProvider, ServiceManagerContainerProvider};
56pub use overlay_manager::{make_interface_name, OverlayManager};
57pub use proxy_manager::{ProxyManager, ProxyManagerConfig};
58pub use runtime::*;
59pub use runtimes::{create_runtime_for_image, detect_image_artifact_type};
60
61// Youki runtime types are only available on Linux with the `youki-runtime` feature.
62#[cfg(all(target_os = "linux", feature = "youki-runtime"))]
63pub use runtimes::{YoukiConfig, YoukiRuntime};
64
65#[cfg(feature = "docker")]
66pub use runtimes::DockerRuntime;
67
68#[cfg(feature = "wasm")]
69pub use runtimes::{WasmConfig, WasmRuntime};
70
71#[cfg(target_os = "macos")]
72pub use runtimes::macos_sandbox::SandboxRuntime;
73#[cfg(target_os = "macos")]
74pub use runtimes::macos_vm::VmRuntime;
75
76pub use service::*;
77pub use stabilization::{
78    wait_for_stabilization, ServiceHealthSummary, StabilizationOutcome, StabilizationResult,
79};
80pub use storage_manager::{StorageError, StorageManager, VolumeInfo};
81pub use worker_client::{
82    WorkerClientError, WorkerClientImpl, WorkerIdentity, WorkerStatusProvider,
83};
84
85#[cfg(target_os = "macos")]
86use std::path::PathBuf;
87use std::sync::Arc;
88
/// Settings for the macOS sandbox-based container runtime.
///
/// The runtime relies on Apple's sandbox framework (sandbox_init/sandbox-exec)
/// for process isolation. It is the preferred macOS runtime when Docker is
/// unavailable or unwanted.
#[cfg(target_os = "macos")]
#[derive(Clone, Debug)]
pub struct MacSandboxConfig {
    /// Where container data and root filesystems are stored.
    pub data_dir: PathBuf,
    /// Where container logs are written.
    pub log_dir: PathBuf,
    /// Enables GPU access (Metal/MPS) for containers when `true`.
    pub gpu_access: bool,
}
104
#[cfg(target_os = "macos")]
impl Default for MacSandboxConfig {
    /// Builds the default configuration from the system-wide `ZLayer`
    /// directory layout, with GPU access switched on.
    fn default() -> Self {
        let system_dirs = zlayer_paths::ZLayerDirs::system_default();
        let data_dir = system_dirs.data_dir().to_path_buf();
        let log_dir = system_dirs.logs();
        Self {
            data_dir,
            log_dir,
            gpu_access: true,
        }
    }
}
116
/// Configuration for selecting and configuring a container runtime
///
/// Which variants exist depends on the target OS and enabled cargo features;
/// see the `cfg` attribute on each variant.
#[derive(Debug, Clone, Default)]
pub enum RuntimeConfig {
    /// Automatically select the best available runtime
    ///
    /// Selection logic (implemented by `create_auto_runtime`):
    /// - On Linux (with the `youki-runtime` feature): uses the bundled
    ///   libcontainer runtime (no external binary needed), falls back to Docker
    /// - On macOS: uses the sandbox runtime as the primary of a composite
    ///   runtime (with VM / VZ delegates when they initialize); if the sandbox
    ///   is unavailable, uses the libkrun VM runtime on its own; otherwise
    ///   falls back to Docker
    /// - On Windows: uses the native HCS runtime (with a WSL2 delegate when the
    ///   `wsl` feature is enabled and WSL2 is usable), falls back to Docker
    /// - Docker fallbacks require the `docker` feature
    /// - If no runtime can be initialized, returns an error
    #[default]
    Auto,
    /// Use the mock runtime for testing and development
    Mock,
    /// Use youki/libcontainer as the container runtime (Linux only, requires the `youki-runtime` feature)
    #[cfg(all(target_os = "linux", feature = "youki-runtime"))]
    Youki(YoukiConfig),
    /// Use Docker daemon as the container runtime (cross-platform)
    #[cfg(feature = "docker")]
    Docker,
    /// Use WebAssembly runtime with wasmtime for WASM workloads
    #[cfg(feature = "wasm")]
    Wasm(WasmConfig),
    /// Use macOS sandbox-based container runtime
    #[cfg(target_os = "macos")]
    MacSandbox(MacSandboxConfig),
    /// Use macOS libkrun micro-VMs for Linux-guest isolation.
    #[cfg(target_os = "macos")]
    MacVm,
    /// Use Apple `Virtualization.framework` for ephemeral native-macOS guest
    /// VMs. Opt-in only (never `Auto`); route via `com.zlayer.isolation=vz`.
    #[cfg(target_os = "macos")]
    MacVz,
    /// WSL2 backend (deprecated).
    ///
    /// Preserved for one release for back-compat with existing `runtime: wsl2`
    /// configs. No real WSL2-specific backend was ever shipped — this variant
    /// was a stub that suggested using Docker Desktop with the WSL2 backend.
    /// `create_runtime` now treats it as [`RuntimeConfig::Hcs`] with a default
    /// `HcsConfig`.
    #[cfg(target_os = "windows")]
    #[deprecated(
        note = "Wsl2 is deprecated in favor of Hcs (native Windows containers via the \
                Host Compute Service). This variant is preserved for one release and \
                currently aliases to Hcs with a default config at dispatch time."
    )]
    Wsl2,
    /// Native Windows container runtime via the Host Compute Service (HCS).
    ///
    /// Windows-only. Drives containers directly against the Windows HCS API
    /// (see [`crate::runtimes::hcs`]) without requiring Docker Desktop or a
    /// WSL2 VM.
    #[cfg(target_os = "windows")]
    Hcs(crate::runtimes::hcs::HcsConfig),
}
170
/// Probe whether a local Docker daemon can be reached.
///
/// Connects with bollard's platform-specific local defaults and then pings
/// the daemon. Every outcome is logged at `debug` level; failures are never
/// propagated to the caller.
///
/// # Returns
/// `true` only when both the connection and the ping succeed, `false` otherwise
#[cfg(feature = "docker")]
pub async fn is_docker_available() -> bool {
    // Step 1: obtain a client handle; bail out early if that already fails.
    let client = match bollard::Docker::connect_with_local_defaults() {
        Ok(client) => client,
        Err(e) => {
            tracing::debug!(error = %e, "Failed to connect to Docker daemon");
            return false;
        }
    };

    // Step 2: a successful ping proves the daemon is actually responsive.
    if let Err(e) = client.ping().await {
        tracing::debug!(error = %e, "Docker daemon ping failed");
        return false;
    }

    tracing::debug!("Docker daemon is available");
    true
}
199
/// Docker availability probe for builds without the `docker` feature.
///
/// Always resolves to `false`. It is kept `async` so it has the same
/// signature as the feature-enabled variant.
#[cfg(not(feature = "docker"))]
#[allow(clippy::unused_async)] // must stay `async` to mirror the `docker`-enabled variant
pub async fn is_docker_available() -> bool {
    false
}
206
/// Report whether WASM support was compiled into this binary.
///
/// The answer is fixed at build time: `true` when the `wasm` cargo feature
/// (and with it the wasmtime runtime) is enabled, `false` otherwise.
///
/// # Example
///
/// ```
/// use zlayer_agent::is_wasm_available;
///
/// if is_wasm_available() {
///     println!("WASM runtime is available");
/// } else {
///     println!("WASM runtime is not compiled in");
/// }
/// ```
#[must_use]
pub fn is_wasm_available() -> bool {
    // `cfg!` expands to a literal `true`/`false` for the active feature set,
    // so one definition covers both the enabled and the disabled build.
    cfg!(feature = "wasm")
}
235
236/// Create a runtime based on the provided configuration
237///
238/// # Arguments
239/// * `config` - The runtime configuration specifying which runtime to use
240///
241/// # Returns
242/// An `Arc<dyn Runtime + Send + Sync>` that can be used with `ServiceManager`
243///
244/// # Errors
245/// Returns `AgentError` if the runtime cannot be initialized (e.g., failed to create
246/// required directories, no runtime available for Auto mode)
247///
248/// # Runtime Selection for Auto Mode
249///
250/// When `RuntimeConfig::Auto` is specified:
251/// - **Linux**: Uses bundled libcontainer runtime (no external binary needed), falls back to Docker
252/// - **macOS**: Uses sandbox runtime (native Metal/MPS), falls back to VM runtime (libkrun), then Docker
253/// - **Windows**: Uses Docker directly
254/// - If no runtime can be initialized, returns an error
255///
256/// # Example
257/// ```no_run
258/// use zlayer_agent::{RuntimeConfig, create_runtime};
259///
260/// # async fn example() -> Result<(), zlayer_agent::AgentError> {
261/// let runtime = create_runtime(RuntimeConfig::Auto, None).await?;
262/// # Ok(())
263/// # }
264/// ```
265#[allow(clippy::too_many_lines)]
266pub async fn create_runtime(
267    config: RuntimeConfig,
268    auth_ctx: Option<ContainerAuthContext>,
269) -> Result<Arc<dyn Runtime + Send + Sync>> {
270    match config {
271        RuntimeConfig::Auto => create_auto_runtime(auth_ctx).await,
272        RuntimeConfig::Mock => Ok(Arc::new(MockRuntime::new())),
273        #[cfg(all(target_os = "linux", feature = "youki-runtime"))]
274        RuntimeConfig::Youki(youki_config) => {
275            let runtime = YoukiRuntime::new(youki_config, auth_ctx).await?;
276            Ok(Arc::new(runtime))
277        }
278        #[cfg(feature = "docker")]
279        RuntimeConfig::Docker => {
280            let runtime = DockerRuntime::new(auth_ctx).await?;
281            Ok(Arc::new(runtime))
282        }
283        #[cfg(feature = "wasm")]
284        RuntimeConfig::Wasm(wasm_config) => {
285            let runtime = WasmRuntime::new(wasm_config, auth_ctx).await?;
286            Ok(Arc::new(runtime))
287        }
288        #[cfg(target_os = "macos")]
289        RuntimeConfig::MacSandbox(config) => {
290            let primary: Arc<dyn Runtime> = Arc::new(runtimes::macos_sandbox::SandboxRuntime::new(
291                config,
292                auth_ctx.clone(),
293            )?);
294            let delegate: Option<Arc<dyn Runtime>> = match runtimes::macos_vm::VmRuntime::new(
295                auth_ctx.clone(),
296            ) {
297                Ok(rt) => {
298                    tracing::info!(
299                            "macOS VM (libkrun) delegate available — Linux containers will execute in a micro-VM"
300                        );
301                    Some(Arc::new(rt))
302                }
303                Err(e) => {
304                    tracing::warn!(
305                        error = %e,
306                        "macOS VM delegate unavailable; node will only run mac-native containers"
307                    );
308                    None
309                }
310            };
311            // VZ Linux-guest delegate (the default Linux path on macOS). First
312            // party (no dylib), so this normally succeeds.
313            let vz_linux: Option<Arc<dyn Runtime>> =
314                runtimes::macos_vz_linux::VzLinuxRuntime::new(auth_ctx.clone())
315                    .map(|rt| Arc::new(rt) as Arc<dyn Runtime>)
316                    .ok();
317            // Opt-in VZ delegate (native-macOS guests via `com.zlayer.isolation=vz`).
318            let vz: Option<Arc<dyn Runtime>> = match runtimes::macos_vz::VzRuntime::new(auth_ctx) {
319                Ok(rt) => Some(Arc::new(rt)),
320                Err(e) => {
321                    tracing::warn!(error = %e, "macOS VZ delegate unavailable");
322                    None
323                }
324            };
325            // Point image-OS inspection at BOTH persistent blob caches the
326            // composite's `pull_image` writes into, tried in order:
327            //   1. the VZ-Linux runtime's `{data_dir}/vz/linux/images/blobs.redb`
328            //      (the delegate that actually runs the Linux workload), and
329            //   2. the primary Sandbox runtime's `{data_dir}/images/blobs.redb`.
330            // `pull_image` pulls into BOTH (primary first, then VZ-Linux), and
331            // either pull short-circuits under `IfNotPresent` when its rootfs
332            // already exists — so an already-pulled image's manifest+config may
333            // live in only ONE of the two stores. Probing both (local-only, no
334            // network per cache) lets the composite resolve a locally-cached
335            // Linux image's OS with NO network call — so the workload still
336            // routes to VZ-Linux even when Docker Hub is rate-limiting the
337            // redundant OS re-inspection.
338            let data_dir = zlayer_paths::ZLayerDirs::default_data_dir();
339            let os_inspect_cache_paths = vec![
340                data_dir
341                    .join("vz")
342                    .join("linux")
343                    .join("images")
344                    .join("blobs.redb"),
345                data_dir.join("images").join("blobs.redb"),
346            ];
347            Ok(Arc::new(
348                runtimes::composite::CompositeRuntime::new(primary, delegate)
349                    .with_vz_delegate(vz)
350                    .with_vz_linux_delegate(vz_linux)
351                    .with_os_inspect_cache_paths(os_inspect_cache_paths),
352            ))
353        }
354        #[cfg(target_os = "macos")]
355        RuntimeConfig::MacVm => Ok(Arc::new(runtimes::macos_vm::VmRuntime::new(auth_ctx)?)),
356        #[cfg(target_os = "macos")]
357        RuntimeConfig::MacVz => Ok(Arc::new(runtimes::macos_vz::VzRuntime::new(auth_ctx)?)),
358        #[cfg(target_os = "windows")]
359        #[allow(deprecated)]
360        RuntimeConfig::Wsl2 => {
361            tracing::warn!(
362                "RuntimeConfig::Wsl2 is deprecated; treating as RuntimeConfig::Hcs with default config"
363            );
364            Box::pin(create_runtime(
365                RuntimeConfig::Hcs(crate::runtimes::hcs::HcsConfig::default()),
366                auth_ctx,
367            ))
368            .await
369        }
370        #[cfg(target_os = "windows")]
371        RuntimeConfig::Hcs(hcs_config) => {
372            let primary: Arc<dyn Runtime> =
373                Arc::new(crate::runtimes::hcs::HcsRuntime::new(hcs_config).await?);
374
375            #[cfg(feature = "wsl")]
376            let delegate: Option<Arc<dyn Runtime>> =
377                match runtimes::wsl2_delegate::Wsl2DelegateRuntime::try_new().await {
378                    Ok(Some(rt)) => {
379                        tracing::info!(
380                            "WSL2 delegate runtime available — Linux containers will execute inside the zlayer distro"
381                        );
382                        Some(Arc::new(rt))
383                    }
384                    Ok(None) => {
385                        tracing::info!(
386                            "WSL2 not available; node will only run Windows-image containers"
387                        );
388                        None
389                    }
390                    Err(e) => {
391                        tracing::warn!(
392                            error = %e,
393                            "WSL2 delegate setup failed; node will only run Windows-image containers"
394                        );
395                        None
396                    }
397                };
398            #[cfg(not(feature = "wsl"))]
399            let delegate: Option<Arc<dyn Runtime>> = None;
400
401            Ok(Arc::new(runtimes::composite::CompositeRuntime::new(
402                primary, delegate,
403            )))
404        }
405    }
406}
407
408/// Automatically select and create the best available runtime
409///
410/// Selection logic:
411/// - On Linux: Uses bundled libcontainer runtime directly (no external binary needed), falls back to Docker
412/// - On macOS: `SandboxRuntime` (native Metal/MPS) → `VmRuntime` (libkrun Linux compat with GPU) → Docker
413/// - On Windows: Use Docker directly
414/// - Returns an error if no runtime can be initialized
415#[cfg_attr(
416    not(all(target_os = "linux", feature = "youki-runtime")),
417    allow(clippy::unused_async)
418)]
419#[cfg_attr(
420    not(any(
421        all(target_os = "linux", feature = "youki-runtime"),
422        target_os = "macos",
423        feature = "docker"
424    )),
425    allow(unused_variables)
426)]
427#[allow(clippy::too_many_lines)]
428async fn create_auto_runtime(
429    auth_ctx: Option<ContainerAuthContext>,
430) -> Result<Arc<dyn Runtime + Send + Sync>> {
431    tracing::info!("Auto-selecting container runtime");
432
433    // On Linux, use bundled libcontainer runtime (no daemon overhead, no external binary needed)
434    #[cfg(all(target_os = "linux", feature = "youki-runtime"))]
435    {
436        match YoukiRuntime::new(YoukiConfig::default(), auth_ctx.clone()).await {
437            Ok(runtime) => {
438                tracing::info!("Using bundled libcontainer runtime (Linux-native, no daemon)");
439                return Ok(Arc::new(runtime));
440            }
441            Err(e) => {
442                tracing::warn!(error = %e, "Failed to initialize libcontainer runtime, trying Docker");
443            }
444        }
445    }
446
447    // On macOS, build a composite runtime:
448    //   primary  = SandboxRuntime (native Metal/MPS), when available
449    //   delegate = VmRuntime (libkrun Linux compat), when available
450    // If at least the primary is available, return the composite. Otherwise
451    // (e.g. sandbox init failed), fall through to Docker.
452    #[cfg(target_os = "macos")]
453    {
454        let primary: Option<Arc<dyn Runtime>> = match runtimes::macos_sandbox::SandboxRuntime::new(
455            MacSandboxConfig::default(),
456            auth_ctx.clone(),
457        ) {
458            Ok(rt) => Some(Arc::new(rt)),
459            Err(e) => {
460                tracing::warn!("macOS sandbox runtime unavailable: {e}");
461                None
462            }
463        };
464        let delegate: Option<Arc<dyn Runtime>> = match runtimes::macos_vm::VmRuntime::new(
465            auth_ctx.clone(),
466        ) {
467            Ok(rt) => {
468                tracing::info!(
469                        "macOS VM (libkrun) delegate available — Linux containers will execute in a micro-VM"
470                    );
471                Some(Arc::new(rt))
472            }
473            Err(e) => {
474                tracing::warn!("macOS VM runtime (libkrun) unavailable: {e}");
475                None
476            }
477        };
478        // Opt-in VZ delegate (native-macOS guests via `com.zlayer.isolation=vz`);
479        // never the default, only used when a service requests it.
480        let vz: Option<Arc<dyn Runtime>> = runtimes::macos_vz::VzRuntime::new(auth_ctx.clone())
481            .map(|rt| Arc::new(rt) as Arc<dyn Runtime>)
482            .ok();
483        // VZ Linux-guest delegate — the default Linux path on macOS.
484        let vz_linux: Option<Arc<dyn Runtime>> =
485            runtimes::macos_vz_linux::VzLinuxRuntime::new(auth_ctx.clone())
486                .map(|rt| Arc::new(rt) as Arc<dyn Runtime>)
487                .ok();
488
489        if let Some(p) = primary {
490            // Point image-OS dispatch inspection at BOTH persistent blob caches
491            // the composite's `pull_image` writes into (VZ-Linux first, then the
492            // primary Sandbox store), so an already-pulled image's OS resolves
493            // LOCAL-ONLY with no network round-trip — the cached Linux image
494            // routes to VZ-Linux even when Docker Hub is rate-limiting. Mirrors
495            // the `RuntimeConfig::MacSandbox` arm above.
496            let data_dir = zlayer_paths::ZLayerDirs::default_data_dir();
497            let os_inspect_cache_paths = vec![
498                data_dir
499                    .join("vz")
500                    .join("linux")
501                    .join("images")
502                    .join("blobs.redb"),
503                data_dir.join("images").join("blobs.redb"),
504            ];
505            return Ok(Arc::new(
506                runtimes::composite::CompositeRuntime::new(p, delegate)
507                    .with_vz_delegate(vz)
508                    .with_vz_linux_delegate(vz_linux)
509                    .with_os_inspect_cache_paths(os_inspect_cache_paths),
510            ));
511        }
512        // If sandbox failed but VM succeeded, use the VM runtime on its own —
513        // it's still the best available native macOS path before falling back
514        // to Docker.
515        if let Some(d) = delegate {
516            return Ok(d);
517        }
518    }
519
520    // On Windows, build a composite runtime:
521    //   primary  = HcsRuntime (native Windows containers), when available
522    //   delegate = Wsl2DelegateRuntime (Linux containers via WSL2), when available
523    // If the primary is available, return the composite. Otherwise fall
524    // through to Docker.
525    #[cfg(target_os = "windows")]
526    {
527        let primary: Option<Arc<dyn Runtime>> =
528            match crate::runtimes::hcs::HcsRuntime::new(crate::runtimes::hcs::HcsConfig::default())
529                .await
530            {
531                Ok(rt) => {
532                    tracing::info!(
533                        "Using native Windows HCS runtime (no Docker Desktop / WSL2 required)"
534                    );
535                    Some(Arc::new(rt))
536                }
537                Err(e) => {
538                    tracing::warn!(error = %e, "HCS runtime unavailable, falling back to Docker");
539                    None
540                }
541            };
542
543        #[cfg(feature = "wsl")]
544        let delegate: Option<Arc<dyn Runtime>> =
545            match runtimes::wsl2_delegate::Wsl2DelegateRuntime::try_new().await {
546                Ok(Some(rt)) => {
547                    tracing::info!(
548                        "WSL2 delegate runtime available — Linux containers will execute inside the zlayer distro"
549                    );
550                    Some(Arc::new(rt))
551                }
552                Ok(None) => {
553                    tracing::info!(
554                        "WSL2 not available; node will only run Windows-image containers"
555                    );
556                    None
557                }
558                Err(e) => {
559                    tracing::warn!(
560                        error = %e,
561                        "WSL2 delegate setup failed; node will only run Windows-image containers"
562                    );
563                    None
564                }
565            };
566        #[cfg(not(feature = "wsl"))]
567        let delegate: Option<Arc<dyn Runtime>> = None;
568
569        if let Some(p) = primary {
570            return Ok(Arc::new(runtimes::composite::CompositeRuntime::new(
571                p, delegate,
572            )));
573        }
574    }
575
576    // On non-Linux or if libcontainer failed, try Docker
577    #[cfg(feature = "docker")]
578    {
579        if is_docker_available().await {
580            tracing::info!("Selected Docker runtime");
581            let runtime = DockerRuntime::new(auth_ctx).await?;
582            return Ok(Arc::new(runtime));
583        }
584        tracing::debug!("Docker daemon not available");
585    }
586
587    // No runtime available
588    #[cfg(all(target_os = "linux", feature = "docker"))]
589    {
590        Err(AgentError::Configuration(
591            "Bundled libcontainer runtime failed to initialize and Docker daemon is not available."
592                .to_string(),
593        ))
594    }
595
596    #[cfg(all(target_os = "linux", not(feature = "docker")))]
597    {
598        Err(AgentError::Configuration(
599            "Bundled libcontainer runtime failed to initialize. Enable the 'docker' feature for an alternative."
600                .to_string(),
601        ))
602    }
603
604    #[cfg(all(not(target_os = "linux"), feature = "docker"))]
605    {
606        Err(AgentError::Configuration(
607            "No container runtime available. Start the Docker daemon.".to_string(),
608        ))
609    }
610
611    #[cfg(all(not(target_os = "linux"), not(feature = "docker")))]
612    {
613        Err(AgentError::Configuration(
614            "No container runtime available. Enable the 'docker' feature and start the Docker daemon.".to_string(),
615        ))
616    }
617}