zlayer_agent/error.rs
1//! Agent-specific errors
2
3use std::time::Duration;
4use thiserror::Error;
5
6/// Agent runtime errors
7#[derive(Debug, Error)]
8pub enum AgentError {
9 /// Container not found
10 #[error("Container '{container}' not found: {reason}")]
11 NotFound { container: String, reason: String },
12
13 /// Failed to pull image
14 #[error("Failed to pull image '{image}': {reason}")]
15 PullFailed { image: String, reason: String },
16
17 /// Failed to create container
18 #[error("Failed to create container '{id}': {reason}")]
19 CreateFailed { id: String, reason: String },
20
21 /// Failed to start container
22 #[error("Failed to start container '{id}': {reason}")]
23 StartFailed { id: String, reason: String },
24
25 /// Container exited unexpectedly
26 #[error("Container '{id}' exited unexpectedly with code {code}")]
27 UnexpectedExit { id: String, code: i32 },
28
29 /// Health check failed
30 #[error("Health check failed for '{id}': {reason}")]
31 HealthCheckFailed { id: String, reason: String },
32
33 /// Init action failed
34 #[error("Init action failed for '{id}': {reason}")]
35 InitActionFailed { id: String, reason: String },
36
37 /// Timeout
38 #[error("Timeout after {timeout:?}")]
39 Timeout { timeout: Duration },
40
41 /// Dependency timeout - service waiting for dependency condition
42 #[error("Dependency timeout: '{service}' waiting for '{dependency}' ({condition}) after {timeout:?}")]
43 DependencyTimeout {
44 service: String,
45 dependency: String,
46 condition: String,
47 timeout: Duration,
48 },
49
50 /// Invalid spec
51 #[error("Invalid spec: {0}")]
52 InvalidSpec(String),
53
54 /// Network setup or operation failed
55 #[error("Network error: {0}")]
56 Network(String),
57
58 /// A published host port (`127.0.0.1:<port>`) is already owned by a
59 /// different `(deployment, service)`. A host port is a global resource and
60 /// cannot be bound twice; the conflicting publish is refused rather than
61 /// silently cross-wired into the foreign backend pool.
62 #[error("Host port {port} is already published by {owner}; cannot publish it for {requester}")]
63 PortConflict {
64 port: u16,
65 owner: String,
66 requester: String,
67 },
68
69 /// Configuration error (missing or invalid configuration)
70 #[error("Configuration error: {0}")]
71 Configuration(String),
72
73 /// Internal runtime error
74 #[error("Internal error: {0}")]
75 Internal(String),
76
77 /// Operation is not supported by this runtime
78 #[error("Operation not supported by this runtime: {0}")]
79 Unsupported(String),
80
81 /// GPU was requested by the service spec, but the underlying WSL2 host
82 /// cannot deliver GPU access (typically because `/dev/dxg` is not exposed
83 /// by the running WSL2 kernel, or the `WSLg` driver shim mount is missing).
84 ///
85 /// Returned by the WSL2 delegate when wiring `/dev/dxg` and the `WSLg` lib
86 /// mounts into the youki bundle. Silent CPU fallback would be surprising
87 /// for users who explicitly asked for a GPU, so this is a hard error;
88 /// callers must either downgrade the spec to drop `resources.gpu` or
89 /// re-place the workload on a node whose WSL2 distro exposes the
90 /// `DirectX` kernel interface.
91 #[error("GPU requested but WSL2 GPU support not available on this host: {reason}")]
92 WslGpuUnavailable { reason: String },
93
94 /// GPU sharing was requested (MPS or time-slicing) but the host or
95 /// runtime cannot satisfy the requested mode.
96 ///
97 /// Typical causes:
98 /// * `mode = "mps"` but the host MPS pipe / log directory does not exist
99 /// (the `nvidia-cuda-mps-control` daemon is not running).
100 /// * `mode = "mps"` combined with `isolation: hyperv` on Windows — MPS is
101 /// not exposed inside the UVM kernel.
102 ///
103 /// Silent fallback to exclusive-mode access would be surprising for users
104 /// who explicitly opted in to sharing (they may be relying on sharing for
105 /// capacity planning), so this is a hard error. Callers must either fix
106 /// the host (start the MPS daemon, switch isolation) or drop the
107 /// `sharing` field from the spec.
108 #[error("GPU sharing mode '{mode}' is unavailable: {reason}")]
109 GpuSharingUnavailable {
110 /// Sharing mode that could not be satisfied (`"mps"`, `"time-slice"`).
111 mode: String,
112 /// Human-readable explanation (e.g. "/tmp/nvidia-mps does not exist; \
113 /// ensure nvidia-cuda-mps-control is running").
114 reason: String,
115 },
116
117 /// The workload cannot run on this node and must be re-placed on a peer
118 /// that can satisfy `required_os`.
119 ///
120 /// Returned by [`crate::runtimes::composite::CompositeRuntime::select_for`]
121 /// when a foreign-OS workload (today: Linux on a Windows node) lands on a
122 /// node that has no suitable local runtime (e.g. no WSL2 delegate
123 /// configured). The scheduler is expected to catch this and re-dispatch
124 /// to a cluster peer whose `NodeState.os` matches `required_os`. When no
125 /// capable peer exists the scheduler marks the service failed with an
126 /// actionable message naming both remediations (enable the local WSL2
127 /// delegate, or add a Linux peer to the cluster).
128 ///
129 /// This variant is *not* a container failure: the service manager must
130 /// surface it to the scheduler and must not roll up `CreateFailed` on top
131 /// of it, otherwise the rescheduling signal is lost.
132 #[error(
133 "route-to-peer: service '{service}' requires OS '{required_os}' on another node: {reason}"
134 )]
135 RouteToPeer {
136 /// Service name that needs to be re-placed.
137 service: String,
138 /// OS the workload requires (OCI-canonical: `linux` / `windows` / `darwin`).
139 required_os: String,
140 /// Human-readable explanation (e.g. "no WSL2 delegate configured on this Windows node").
141 reason: String,
142 },
143
144 /// The local runtime cannot service this image because the image's OS
145 /// does not match the runtime's expected OS.
146 ///
147 /// Returned by the HCS runtime when an image's OCI config reports
148 /// `os != "windows"` (e.g. a Linux alpine image landing on a Windows host
149 /// that also has a WSL2 delegate). Calling `vmcompute.dll!ProcessBaseImage`
150 /// on a non-Windows base layer is guaranteed to fail with
151 /// `ERROR_PATH_NOT_FOUND (0x80070003)` because the HCS API expects the
152 /// Windows-specific `Hives/` / `UtilityVM/` / `Files/Windows/System32/`
153 /// layout. Bailing early with this variant lets the composite runtime
154 /// treat the call as a soft skip (the delegate's parallel pull is the one
155 /// that actually owns the image) instead of failing the whole pull.
156 ///
157 /// This is *not* a container failure: callers in the composite layer
158 /// should distinguish this from a real `PullFailed` and continue with the
159 /// delegate's result.
160 #[error(
161 "wrong-platform: {runtime} runtime cannot handle image '{image}' (expected os={expected}, got os={actual})"
162 )]
163 WrongPlatform {
164 /// Identifier of the runtime that rejected the image (e.g. `"hcs"`,
165 /// `"wsl2"`).
166 runtime: String,
167 /// OCI-canonical OS this runtime expects (e.g. `"windows"`, `"linux"`).
168 expected: String,
169 /// OCI-canonical OS the image manifest reports.
170 actual: String,
171 /// Image reference that triggered the mismatch.
172 image: String,
173 },
174}
175
176pub type Result<T, E = AgentError> = std::result::Result<T, E>;