Skip to main content

zlayer_agent/
error.rs

1//! Agent-specific errors
2
3use std::time::Duration;
4use thiserror::Error;
5
6/// Agent runtime errors
7#[derive(Debug, Error)]
8pub enum AgentError {
9    /// Container not found
10    #[error("Container '{container}' not found: {reason}")]
11    NotFound { container: String, reason: String },
12
13    /// Failed to pull image
14    #[error("Failed to pull image '{image}': {reason}")]
15    PullFailed { image: String, reason: String },
16
17    /// Failed to create container
18    #[error("Failed to create container '{id}': {reason}")]
19    CreateFailed { id: String, reason: String },
20
21    /// Failed to start container
22    #[error("Failed to start container '{id}': {reason}")]
23    StartFailed { id: String, reason: String },
24
25    /// Container exited unexpectedly
26    #[error("Container '{id}' exited unexpectedly with code {code}")]
27    UnexpectedExit { id: String, code: i32 },
28
29    /// Health check failed
30    #[error("Health check failed for '{id}': {reason}")]
31    HealthCheckFailed { id: String, reason: String },
32
33    /// Init action failed
34    #[error("Init action failed for '{id}': {reason}")]
35    InitActionFailed { id: String, reason: String },
36
37    /// Timeout
38    #[error("Timeout after {timeout:?}")]
39    Timeout { timeout: Duration },
40
41    /// Dependency timeout - service waiting for dependency condition
42    #[error("Dependency timeout: '{service}' waiting for '{dependency}' ({condition}) after {timeout:?}")]
43    DependencyTimeout {
44        service: String,
45        dependency: String,
46        condition: String,
47        timeout: Duration,
48    },
49
50    /// Invalid spec
51    #[error("Invalid spec: {0}")]
52    InvalidSpec(String),
53
54    /// A host-source bind mount points at a path that does not exist on disk.
55    ///
56    /// Returned by the OCI bundle builder *before* the spec is handed to
57    /// libcontainer. libcontainer canonicalizes the source path of every bind
58    /// mount during rootfs preparation; a missing source makes the whole
59    /// container start die with the opaque `failed to prepare rootfs` error and
60    /// no indication of *which* mount is at fault. Validating up front turns
61    /// that into an actionable daemon error naming both the missing host
62    /// `source` and the in-container `dest`.
63    ///
64    /// This covers socket mounts, CDI/GPU driver-library mounts, and storage
65    /// volume binds. The daemon-generated `/etc/resolv.conf` convenience mount
66    /// is deliberately *not* surfaced through this variant: if its source is
67    /// missing the builder warns and skips it (DNS injection is best-effort and
68    /// must never block a start).
69    #[error("bind mount source {src_path} does not exist (for container mount {dest})")]
70    MountSourceMissing {
71        /// Host filesystem path that was expected to exist (the bind source).
72        ///
73        /// Deliberately *not* named `source`: thiserror reserves a field named
74        /// `source` for the `Error::source()` cause chain and would require it
75        /// to implement `std::error::Error`, which a plain `String` does not.
76        src_path: String,
77        /// In-container destination path the source was to be mounted at.
78        dest: String,
79    },
80
81    /// Network setup or operation failed
82    #[error("Network error: {0}")]
83    Network(String),
84
85    /// A published host port (`127.0.0.1:<port>`) is already owned by a
86    /// different `(deployment, service)`. A host port is a global resource and
87    /// cannot be bound twice; the conflicting publish is refused rather than
88    /// silently cross-wired into the foreign backend pool.
89    #[error("Host port {port} is already published by {owner}; cannot publish it for {requester}")]
90    PortConflict {
91        port: u16,
92        owner: String,
93        requester: String,
94    },
95
96    /// Configuration error (missing or invalid configuration)
97    #[error("Configuration error: {0}")]
98    Configuration(String),
99
100    /// Internal runtime error
101    #[error("Internal error: {0}")]
102    Internal(String),
103
104    /// Operation is not supported by this runtime
105    #[error("Operation not supported by this runtime: {0}")]
106    Unsupported(String),
107
108    /// GPU was requested by the service spec, but the underlying WSL2 host
109    /// cannot deliver GPU access (typically because `/dev/dxg` is not exposed
110    /// by the running WSL2 kernel, or the `WSLg` driver shim mount is missing).
111    ///
112    /// Returned by the WSL2 delegate when wiring `/dev/dxg` and the `WSLg` lib
113    /// mounts into the youki bundle. Silent CPU fallback would be surprising
114    /// for users who explicitly asked for a GPU, so this is a hard error;
115    /// callers must either downgrade the spec to drop `resources.gpu` or
116    /// re-place the workload on a node whose WSL2 distro exposes the
117    /// `DirectX` kernel interface.
118    #[error("GPU requested but WSL2 GPU support not available on this host: {reason}")]
119    WslGpuUnavailable { reason: String },
120
121    /// GPU sharing was requested (MPS or time-slicing) but the host or
122    /// runtime cannot satisfy the requested mode.
123    ///
124    /// Typical causes:
125    /// * `mode = "mps"` but the host MPS pipe / log directory does not exist
126    ///   (the `nvidia-cuda-mps-control` daemon is not running).
127    /// * `mode = "mps"` combined with `isolation: hyperv` on Windows — MPS is
128    ///   not exposed inside the UVM kernel.
129    ///
130    /// Silent fallback to exclusive-mode access would be surprising for users
131    /// who explicitly opted in to sharing (they may be relying on sharing for
132    /// capacity planning), so this is a hard error. Callers must either fix
133    /// the host (start the MPS daemon, switch isolation) or drop the
134    /// `sharing` field from the spec.
135    #[error("GPU sharing mode '{mode}' is unavailable: {reason}")]
136    GpuSharingUnavailable {
137        /// Sharing mode that could not be satisfied (`"mps"`, `"time-slice"`).
138        mode: String,
139        /// Human-readable explanation (e.g. "/tmp/nvidia-mps does not exist; \
140        /// ensure nvidia-cuda-mps-control is running").
141        reason: String,
142    },
143
144    /// The workload cannot run on this node and must be re-placed on a peer
145    /// that can satisfy `required_os`.
146    ///
147    /// Returned by [`crate::runtimes::composite::CompositeRuntime::select_for`]
148    /// when a foreign-OS workload (today: Linux on a Windows node) lands on a
149    /// node that has no suitable local runtime (e.g. no WSL2 delegate
150    /// configured). The scheduler is expected to catch this and re-dispatch
151    /// to a cluster peer whose `NodeState.os` matches `required_os`. When no
152    /// capable peer exists the scheduler marks the service failed with an
153    /// actionable message naming both remediations (enable the local WSL2
154    /// delegate, or add a Linux peer to the cluster).
155    ///
156    /// This variant is *not* a container failure: the service manager must
157    /// surface it to the scheduler and must not roll up `CreateFailed` on top
158    /// of it, otherwise the rescheduling signal is lost.
159    #[error(
160        "route-to-peer: service '{service}' requires OS '{required_os}' on another node: {reason}"
161    )]
162    RouteToPeer {
163        /// Service name that needs to be re-placed.
164        service: String,
165        /// OS the workload requires (OCI-canonical: `linux` / `windows` / `darwin`).
166        required_os: String,
167        /// Human-readable explanation (e.g. "no WSL2 delegate configured on this Windows node").
168        reason: String,
169    },
170
171    /// The local runtime cannot service this image because the image's OS
172    /// does not match the runtime's expected OS.
173    ///
174    /// Returned by the HCS runtime when an image's OCI config reports
175    /// `os != "windows"` (e.g. a Linux alpine image landing on a Windows host
176    /// that also has a WSL2 delegate). Calling `vmcompute.dll!ProcessBaseImage`
177    /// on a non-Windows base layer is guaranteed to fail with
178    /// `ERROR_PATH_NOT_FOUND (0x80070003)` because the HCS API expects the
179    /// Windows-specific `Hives/` / `UtilityVM/` / `Files/Windows/System32/`
180    /// layout. Bailing early with this variant lets the composite runtime
181    /// treat the call as a soft skip (the delegate's parallel pull is the one
182    /// that actually owns the image) instead of failing the whole pull.
183    ///
184    /// This is *not* a container failure: callers in the composite layer
185    /// should distinguish this from a real `PullFailed` and continue with the
186    /// delegate's result.
187    #[error(
188        "wrong-platform: {runtime} runtime cannot handle image '{image}' (expected os={expected}, got os={actual})"
189    )]
190    WrongPlatform {
191        /// Identifier of the runtime that rejected the image (e.g. `"hcs"`,
192        /// `"wsl2"`).
193        runtime: String,
194        /// OCI-canonical OS this runtime expects (e.g. `"windows"`, `"linux"`).
195        expected: String,
196        /// OCI-canonical OS the image manifest reports.
197        actual: String,
198        /// Image reference that triggered the mismatch.
199        image: String,
200    },
201}
202
203pub type Result<T, E = AgentError> = std::result::Result<T, E>;