vyre_driver/backend/error.rs
1//! Actionable backend error taxonomy.
2
3use crate::Error;
4
/// Machine-readable classification of a backend failure kind.
///
/// Use this to drive retry logic, circuit breakers, and alerting rules
/// without parsing human-readable message strings.
///
/// [`BackendError::code`] maps every error variant onto one of these codes,
/// and [`ErrorCode::stable_id`] exposes the numeric identifier for each.
#[non_exhaustive]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ErrorCode {
    /// Backend device reported insufficient memory.
    DeviceOutOfMemory,
    /// The backend does not support a required feature.
    UnsupportedFeature,
    /// A lock used by the backend was found poisoned.
    ///
    /// This is generally caused by a panic while a write guard was held and
    /// indicates an internal synchronization bug in process state.
    PoisonedLock,
    /// Kernel-source compilation failed.
    ///
    /// The code covers any kernel-source compile failure for any backend,
    /// including kernel-source or binary validation.
    // NOTE(review): the previous doc text said "Shader" in the variant name is
    // historical and that a 2.0 rename to `KernelCompileFailed` is tracked in
    // the semver-policy doc, but the variant already carries that name here.
    // Confirm whether the semver-policy doc still needs an entry for it.
    KernelCompileFailed,
    /// Command dispatch or queue submission failed.
    DispatchFailed,
    /// The program itself is invalid for this backend.
    InvalidProgram,
    /// A cooperative (whole-grid-sync) launch could not fit every block
    /// co-resident on the device. This is a routable performance condition,
    /// not a hard failure: the orchestrator should fall back (loudly) to a
    /// recall-identical non-cooperative path (resident fixpoint or host split).
    CooperativeResidencyExceeded,
    /// Unclassified error.
    ///
    /// Reported for [`BackendError::Raw`], the variant that
    /// [`BackendError::new`] builds.
    Unknown,
}
39
40impl ErrorCode {
41 /// Stable integer identifier for API consumers and diagnostic catalogs.
42 ///
43 /// These ids are append-only. Existing assignments must not be reused or
44 /// renumbered because downstream systems may persist them in telemetry,
45 /// alert rules, and retry policies.
46 #[must_use]
47 pub const fn stable_id(self) -> u32 {
48 match self {
49 Self::DeviceOutOfMemory => 1001,
50 Self::UnsupportedFeature => 1002,
51 Self::PoisonedLock => 1003,
52 Self::KernelCompileFailed => 1004,
53 Self::DispatchFailed => 1005,
54 Self::InvalidProgram => 1006,
55 Self::CooperativeResidencyExceeded => 1007,
56 Self::Unknown => 1999,
57 }
58 }
59}
60
/// Actionable backend dispatch failure.
///
/// Every error that flows through the frozen `VyreBackend` contract must
/// include remediation text beginning with `Fix: `. This guarantees that
/// conform reports are directly actionable for backend authors and that
/// consumers never receive an opaque failure string.
///
/// Prefer specific variants (`DeviceOutOfMemory`, `KernelCompileFailed`,
/// etc.) over [`BackendError::new`] in new backends. The `Raw` variant
/// exists solely for backward compatibility with existing call sites.
///
/// The `Display` text of each structured variant is defined by its
/// `#[error(...)]` attribute; use [`BackendError::code`] rather than the
/// rendered text for programmatic handling.
///
/// # Examples
///
/// ```
/// use vyre::BackendError;
///
/// let err = BackendError::new("adapter not found. Fix: install a compatible device driver.");
/// assert!(err.message().contains("Fix:"));
/// ```
#[non_exhaustive]
#[derive(Clone, Debug, Eq, PartialEq, thiserror::Error)]
pub enum BackendError {
    /// Device ran out of memory during buffer allocation or dispatch.
    #[error(
        "device out of memory: requested {requested} bytes, {available} available. Fix: reduce buffer sizes or split the dispatch into smaller chunks."
    )]
    DeviceOutOfMemory {
        /// Bytes requested that triggered the OOM condition.
        requested: u64,
        /// Bytes reported available at the time of the failure.
        available: u64,
    },

    /// The backend does not support a required feature.
    #[error(
        "unsupported feature `{name}` on backend `{backend}`. Fix: check backend capability before using this feature, or select a backend that supports it."
    )]
    UnsupportedFeature {
        /// Feature name (e.g. `"subgroup_ops"`, `"f16"`).
        name: String,
        /// Backend identifier (matches [`crate::backend::VyreBackend::id`]).
        backend: String,
    },

    /// Internal lock poisoning was detected during backend synchronization.
    #[error(
        "backend lock poisoned: {lock_error}. Fix: report the panic origin, prevent panics on lock guards, and retry the backend operation."
    )]
    PoisonedLock {
        /// Diagnostic details from the poison error.
        lock_error: String,
    },

    /// Kernel-source compilation failed.
    ///
    /// The variant applies to any kernel-source compile failure across
    /// backends.
    // NOTE(review): the previous doc text said "Shader" in the variant name is
    // historical and that a 2.0 rename to `KernelCompileFailed` is tracked in
    // the semver-policy doc, but the variant already carries that name here.
    // Confirm whether the semver-policy doc still needs an entry for it.
    #[error(
        "kernel-source compile failed on backend `{backend}`: {compiler_message}. Fix: validate the vyre IR before lowering and check the lowered kernel source for type errors."
    )]
    KernelCompileFailed {
        /// Backend identifier.
        backend: String,
        /// Compiler error text or lowered shader / IR excerpt.
        compiler_message: String,
    },

    /// Command dispatch or GPU queue submission failed.
    #[error(
        "dispatch failed (code {code:?}): {message}. Fix: verify adapter limits, buffer sizes, and GPU queue health before retrying."
    )]
    DispatchFailed {
        /// Optional backend-specific numeric error code.
        ///
        /// Rendered with `{:?}`, so the message shows `Some(n)` or `None`.
        code: Option<i32>,
        /// Human-readable failure detail.
        message: String,
    },

    /// The program is structurally invalid for this backend.
    ///
    /// The `fix` text is displayed verbatim with no prefix or suffix added,
    /// so the caller is responsible for including the `Fix: ` section.
    #[error("{fix}")]
    InvalidProgram {
        /// Actionable description, should begin with `Fix: `.
        fix: String,
    },

    /// A cooperative whole-grid launch could not be made fully resident: the
    /// grid has more blocks than the device can co-schedule for a grid-sync
    /// barrier. The orchestrator must fall back (loudly) to a recall-identical
    /// non-cooperative path rather than launch a kernel that would deadlock.
    #[error(
        "cooperative grid-sync launch needs {grid_blocks} co-resident block(s) but the device can fit at most {resident_limit}. Fix: route this dispatch to the resident-fixpoint or host-split grid-sync path, reduce the grid/workgroup size, or lower kernel register/shared-memory pressure. Detail: {detail}"
    )]
    CooperativeResidencyExceeded {
        /// Blocks the launch geometry requires.
        grid_blocks: u64,
        /// Blocks the device can keep co-resident for this kernel.
        resident_limit: u64,
        /// Which residency bound tripped (thread vs occupancy) and the geometry.
        detail: String,
    },

    /// Fallback for backends that have not migrated to structured errors.
    ///
    /// New backends should use a specific variant. This variant exists
    /// solely to preserve backward compatibility with [`BackendError::new`].
    ///
    /// The wrapped string is displayed verbatim. Constructing the variant
    /// directly bypasses the `Fix: ` check that [`BackendError::new`] performs.
    #[error("{0}")]
    Raw(String),
}
171
172impl From<crate::Error> for BackendError {
173 fn from(error: crate::Error) -> Self {
174 Self::new(error.to_string())
175 }
176}
177
178impl BackendError {
179 /// Build a fallback [`BackendError::Raw`] after verifying the message is actionable.
180 ///
181 /// If the supplied message already contains a `Fix: ` section it is used
182 /// verbatim. Otherwise a generic fallback hint is appended. Prefer specific
183 /// variants (`DeviceOutOfMemory`, `KernelCompileFailed`, etc.) over this
184 /// constructor in new code.
185 ///
186 /// # Examples
187 ///
188 /// ```
189 /// use vyre::BackendError;
190 ///
191 /// let err = BackendError::new("queue full. Fix: retry with a smaller dispatch size.");
192 /// assert_eq!(err.to_string(), "queue full. Fix: retry with a smaller dispatch size.");
193 /// ```
194 pub fn new(message: impl Into<String>) -> Self {
195 let message = message.into();
196 if message.contains("Fix: ") {
197 return Self::Raw(message);
198 }
199 Self::Raw(format!(
200 "{message}. Fix: include backend-specific recovery guidance."
201 ))
202 }
203
204 /// Build an actionable unsupported-extension error for opaque IR payloads.
205 #[must_use]
206 pub fn unsupported_extension(
207 backend: impl Into<String>,
208 extension_kind: &str,
209 debug_identity: &str,
210 ) -> Self {
211 Self::UnsupportedFeature {
212 name: format!("opaque IR extension `{extension_kind}`/`{debug_identity}`"),
213 backend: backend.into(),
214 }
215 }
216
217 /// Build a structured lock-poisoning error.
218 ///
219 /// This constructor accepts any `PoisonError` from `RwLock` operations
220 /// and returns an actionable error carrying the root poison metadata.
221 pub fn poisoned_lock<T>(error: std::sync::PoisonError<T>) -> Self {
222 Self::PoisonedLock {
223 lock_error: error.to_string(),
224 }
225 }
226
227 /// Human-readable failure message, equivalent to [`ToString::to_string`].
228 ///
229 /// Prefer explicit `match` on variants or [`ErrorCode`] for programmatic
230 /// error handling; avoid string-parsing this output.
231 #[must_use]
232 pub fn message(&self) -> String {
233 self.to_string()
234 }
235
236 /// Consume this error and return its message string.
237 ///
238 /// Useful in `map_err` chains that expect `String`.
239 #[must_use]
240 pub fn into_message(self) -> String {
241 self.to_string()
242 }
243
244 /// Machine-readable error code for programmatic error handling.
245 ///
246 /// Use this to drive retry logic, circuit breakers, and alerting
247 /// without parsing human-readable message strings.
248 #[must_use]
249 pub fn code(&self) -> ErrorCode {
250 match self {
251 Self::DeviceOutOfMemory { .. } => ErrorCode::DeviceOutOfMemory,
252 Self::UnsupportedFeature { .. } => ErrorCode::UnsupportedFeature,
253 Self::PoisonedLock { .. } => ErrorCode::PoisonedLock,
254 Self::KernelCompileFailed { .. } => ErrorCode::KernelCompileFailed,
255 Self::DispatchFailed { .. } => ErrorCode::DispatchFailed,
256 Self::InvalidProgram { .. } => ErrorCode::InvalidProgram,
257 Self::CooperativeResidencyExceeded { .. } => ErrorCode::CooperativeResidencyExceeded,
258 Self::Raw(_) => ErrorCode::Unknown,
259 }
260 }
261}