1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
//! Agent-specific errors
use std::time::Duration;
use thiserror::Error;
/// Errors produced by the agent runtime.
///
/// The `Display` text of every variant is generated from its `#[error(...)]`
/// attribute: named fields are interpolated by name, tuple payloads by
/// position.
#[derive(Debug, Error)]
pub enum AgentError {
    /// A container could not be found.
    #[error("Container '{container}' not found: {reason}")]
    NotFound {
        /// Container named in the message.
        container: String,
        /// Explanation appended to the message.
        reason: String,
    },

    /// Pulling an image failed.
    #[error("Failed to pull image '{image}': {reason}")]
    PullFailed {
        /// Image named in the message.
        image: String,
        /// Explanation appended to the message.
        reason: String,
    },

    /// Creating a container failed.
    #[error("Failed to create container '{id}': {reason}")]
    CreateFailed {
        /// Identifier of the container named in the message.
        id: String,
        /// Explanation appended to the message.
        reason: String,
    },

    /// Starting a container failed.
    #[error("Failed to start container '{id}': {reason}")]
    StartFailed {
        /// Identifier of the container named in the message.
        id: String,
        /// Explanation appended to the message.
        reason: String,
    },

    /// A container exited when it was not expected to.
    #[error("Container '{id}' exited unexpectedly with code {code}")]
    UnexpectedExit {
        /// Identifier of the container named in the message.
        id: String,
        /// Code the container exited with.
        code: i32,
    },

    /// A health check did not pass.
    #[error("Health check failed for '{id}': {reason}")]
    HealthCheckFailed {
        /// Identifier the failed health check is reported against.
        id: String,
        /// Explanation appended to the message.
        reason: String,
    },

    /// An init action did not succeed.
    #[error("Init action failed for '{id}': {reason}")]
    InitActionFailed {
        /// Identifier the failed init action is reported against.
        id: String,
        /// Explanation appended to the message.
        reason: String,
    },

    /// An operation timed out.
    #[error("Timeout after {timeout:?}")]
    Timeout {
        /// Duration reported in the message (rendered with `Debug`).
        timeout: Duration,
    },

    /// A service timed out while waiting for a dependency condition.
    #[error(
        "Dependency timeout: '{service}' waiting for '{dependency}' ({condition}) after {timeout:?}"
    )]
    DependencyTimeout {
        /// Service that was waiting.
        service: String,
        /// Dependency the service was waiting for.
        dependency: String,
        /// Condition on the dependency that was being awaited.
        condition: String,
        /// Duration reported in the message (rendered with `Debug`).
        timeout: Duration,
    },

    /// The spec is invalid; the payload is the message detail.
    #[error("Invalid spec: {0}")]
    InvalidSpec(String),

    /// Network setup, or a network operation, failed; the payload is the
    /// message detail.
    #[error("Network error: {0}")]
    Network(String),

    /// Configuration is missing or invalid; the payload is the message detail.
    #[error("Configuration error: {0}")]
    Configuration(String),

    /// An internal runtime error; the payload is the message detail.
    #[error("Internal error: {0}")]
    Internal(String),

    /// This runtime does not support the requested operation; the payload is
    /// the message detail.
    #[error("Operation not supported by this runtime: {0}")]
    Unsupported(String),

    /// The service spec asked for a GPU, but the WSL2 host underneath cannot
    /// provide GPU access — typically because the running WSL2 kernel does
    /// not expose `/dev/dxg`, or the `WSLg` driver shim mount is missing.
    ///
    /// The WSL2 delegate returns this while wiring `/dev/dxg` and the `WSLg`
    /// lib mounts into the youki bundle. It is a hard error on purpose:
    /// quietly falling back to CPU would surprise a user who explicitly asked
    /// for a GPU. Callers must either downgrade the spec by dropping
    /// `resources.gpu`, or re-place the workload on a node whose WSL2 distro
    /// exposes the `DirectX` kernel interface.
    #[error("GPU requested but WSL2 GPU support not available on this host: {reason}")]
    WslGpuUnavailable {
        /// Explanation appended to the message.
        reason: String,
    },

    /// A GPU sharing mode (MPS or time-slicing) was requested, but the host
    /// or runtime is unable to satisfy it.
    ///
    /// Typical causes:
    /// * `mode = "mps"` while the host MPS pipe / log directory is absent
    ///   (the `nvidia-cuda-mps-control` daemon is not running).
    /// * `mode = "mps"` together with `isolation: hyperv` on Windows — MPS
    ///   is not exposed inside the UVM kernel.
    ///
    /// This is deliberately a hard error: users who opted in to sharing may
    /// be relying on it for capacity planning, so silently falling back to
    /// exclusive-mode access would be surprising. Callers must either fix the
    /// host (start the MPS daemon, switch isolation) or remove the `sharing`
    /// field from the spec.
    #[error("GPU sharing mode '{mode}' is unavailable: {reason}")]
    GpuSharingUnavailable {
        /// The sharing mode that could not be provided (`"mps"`,
        /// `"time-slice"`).
        mode: String,
        /// Human-readable explanation, e.g. "/tmp/nvidia-mps does not exist;
        /// ensure nvidia-cuda-mps-control is running".
        reason: String,
    },

    /// This node cannot run the workload; it has to be re-placed on a peer
    /// able to satisfy `required_os`.
    ///
    /// [`crate::runtimes::composite::CompositeRuntime::select_for`] returns
    /// this when a foreign-OS workload (today: Linux on a Windows node) lands
    /// on a node with no suitable local runtime (e.g. no WSL2 delegate
    /// configured). The scheduler is expected to catch it and re-dispatch to
    /// a cluster peer whose `NodeState.os` matches `required_os`. If no
    /// capable peer exists, the scheduler marks the service failed with an
    /// actionable message that names both remediations: enable the local
    /// WSL2 delegate, or add a Linux peer to the cluster.
    ///
    /// This is *not* a container failure. The service manager must pass it
    /// through to the scheduler and must not wrap it in `CreateFailed`;
    /// doing so would lose the rescheduling signal.
    #[error(
        "route-to-peer: service '{service}' requires OS '{required_os}' on another node: {reason}"
    )]
    RouteToPeer {
        /// Name of the service that needs to be re-placed.
        service: String,
        /// OS required by the workload (OCI-canonical: `linux` / `windows` /
        /// `darwin`).
        required_os: String,
        /// Human-readable explanation (e.g. "no WSL2 delegate configured on
        /// this Windows node").
        reason: String,
    },

    /// The image's OS differs from the OS the local runtime expects, so this
    /// runtime cannot service the image.
    ///
    /// The HCS runtime returns this when an image's OCI config reports
    /// `os != "windows"` (e.g. a Linux alpine image landing on a Windows host
    /// that also has a WSL2 delegate). Calling
    /// `vmcompute.dll!ProcessBaseImage` on a non-Windows base layer is
    /// guaranteed to fail with `ERROR_PATH_NOT_FOUND (0x80070003)`, because
    /// the HCS API expects the Windows-specific `Hives/` / `UtilityVM/` /
    /// `Files/Windows/System32/` layout. Bailing out early with this variant
    /// lets the composite runtime treat the call as a soft skip — the
    /// delegate's parallel pull is the one that actually owns the image —
    /// rather than failing the whole pull.
    ///
    /// This is *not* a container failure. Callers in the composite layer
    /// should tell it apart from a genuine `PullFailed` and carry on with the
    /// delegate's result.
    #[error(
        "wrong-platform: {runtime} runtime cannot handle image '{image}' (expected os={expected}, got os={actual})"
    )]
    WrongPlatform {
        /// Identifier of the runtime that rejected the image (e.g. `"hcs"`,
        /// `"wsl2"`).
        runtime: String,
        /// OCI-canonical OS expected by this runtime (e.g. `"windows"`,
        /// `"linux"`).
        expected: String,
        /// OCI-canonical OS reported by the image manifest.
        actual: String,
        /// Reference of the image that triggered the mismatch.
        image: String,
    },
}
/// Result alias whose error type defaults to [`AgentError`]; supply the
/// second type parameter to use a different error type.
pub type Result<T, E = AgentError> = ::std::result::Result<T, E>;