Skip to main content

vyre_driver/backend/
error.rs

1//! Actionable backend error taxonomy.
2
3use crate::Error;
4
/// Machine-readable classification of a backend failure kind.
///
/// Use this to drive retry logic, circuit breakers, and alerting rules
/// without parsing human-readable message strings. The type is `Copy` and
/// `Hash`, so it can be used directly as a key in per-code policy tables or
/// counters.
#[non_exhaustive]
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum ErrorCode {
    /// Backend device reported insufficient memory.
    DeviceOutOfMemory,
    /// The backend does not support a required feature.
    UnsupportedFeature,
    /// A lock used by the backend was found poisoned.
    ///
    /// This is generally caused by a panic while a write guard was held and
    /// indicates an internal synchronization bug in process state.
    PoisonedLock,
    /// GPU kernel-source compilation failed.
    ///
    /// The code covers any kernel-source compile failure, as well as kernel
    /// binary validation failures, for any backend.
    KernelCompileFailed,
    /// Command dispatch or queue submission failed.
    DispatchFailed,
    /// The program itself is invalid for this backend.
    InvalidProgram,
    /// A cooperative (whole-grid-sync) launch could not fit every block
    /// co-resident on the device. This is a routable performance condition,
    /// not a hard failure: the orchestrator should fall back (loudly) to a
    /// recall-identical non-cooperative path (resident fixpoint or host split).
    CooperativeResidencyExceeded,
    /// Unclassified error (produced by [`BackendError::new`]).
    Unknown,
}

impl ErrorCode {
    /// Stable integer identifier for API consumers and diagnostic catalogs.
    ///
    /// These ids are append-only. Existing assignments must not be reused or
    /// renumbered because downstream systems may persist them in telemetry,
    /// alert rules, and retry policies.
    ///
    /// The match is deliberately exhaustive (no `_` arm) so that adding a
    /// variant without assigning it an id is a compile error.
    #[must_use]
    pub const fn stable_id(self) -> u32 {
        match self {
            Self::DeviceOutOfMemory => 1001,
            Self::UnsupportedFeature => 1002,
            Self::PoisonedLock => 1003,
            Self::KernelCompileFailed => 1004,
            Self::DispatchFailed => 1005,
            Self::InvalidProgram => 1006,
            Self::CooperativeResidencyExceeded => 1007,
            Self::Unknown => 1999,
        }
    }
}
60
61/// Actionable backend dispatch failure.
62///
63/// Every error that flows through the frozen `VyreBackend` contract must
64/// include remediation text beginning with `Fix: `. This guarantees that
65/// conform reports are directly actionable for backend authors and that
66/// consumers never receive an opaque failure string.
67///
68/// Prefer specific variants (`DeviceOutOfMemory`, `KernelCompileFailed`,
69/// etc.) over [`BackendError::new`] in new backends. The `Raw` variant
70/// exists solely for backward compatibility with existing call sites.
71///
72/// # Examples
73///
74/// ```
75/// use vyre::BackendError;
76///
77/// let err = BackendError::new("adapter not found. Fix: install a compatible device driver.");
78/// assert!(err.message().contains("Fix:"));
79/// ```
80#[non_exhaustive]
81#[derive(Clone, Debug, Eq, PartialEq, thiserror::Error)]
82pub enum BackendError {
83    /// Device ran out of memory during buffer allocation or dispatch.
84    #[error(
85        "device out of memory: requested {requested} bytes, {available} available.          Fix: reduce buffer sizes or split the dispatch into smaller chunks."
86    )]
87    DeviceOutOfMemory {
88        /// Bytes requested that triggered the OOM condition.
89        requested: u64,
90        /// Bytes reported available at the time of the failure.
91        available: u64,
92    },
93
94    /// The backend does not support a required feature.
95    #[error(
96        "unsupported feature `{name}` on backend `{backend}`.          Fix: check backend capability before using this feature, or select a backend that supports it."
97    )]
98    UnsupportedFeature {
99        /// Feature name (e.g. `"subgroup_ops"`, `"f16"`).
100        name: String,
101        /// Backend identifier (matches [`crate::backend::VyreBackend::id`]).
102        backend: String,
103    },
104
105    /// Internal lock poisoning was detected during backend synchronization.
106    #[error(
107        "backend lock poisoned: {lock_error}. Fix: report the panic origin, prevent panics on lock guards, and retry the backend operation."
108    )]
109    PoisonedLock {
110        /// Diagnostic details from the poison error.
111        lock_error: String,
112    },
113
114    /// GPU kernel-source compilation failed.
115    ///
116    /// "Shader" in the variant name is historical and generalised
117    ///  -  the code applies to any kernel-source compile failure across
118    /// backends. A 2.0 rename to
119    /// `KernelCompileFailed` is tracked in the semver-policy doc.
120    #[error(
121        "kernel-source compile failed on backend `{backend}`: {compiler_message}.          Fix: validate the vyre IR before lowering and check the lowered kernel source for type errors."
122    )]
123    KernelCompileFailed {
124        /// Backend identifier.
125        backend: String,
126        /// Compiler error text or lowered shader / IR excerpt.
127        compiler_message: String,
128    },
129
130    /// Command dispatch or GPU queue submission failed.
131    #[error(
132        "dispatch failed (code {code:?}): {message}.          Fix: verify adapter limits, buffer sizes, and GPU queue health before retrying."
133    )]
134    DispatchFailed {
135        /// Optional backend-specific numeric error code.
136        code: Option<i32>,
137        /// Human-readable failure detail.
138        message: String,
139    },
140
141    /// The program is structurally invalid for this backend.
142    #[error("{fix}")]
143    InvalidProgram {
144        /// Actionable description, should begin with `Fix: `.
145        fix: String,
146    },
147
148    /// A cooperative whole-grid launch could not be made fully resident: the
149    /// grid has more blocks than the device can co-schedule for a grid-sync
150    /// barrier. The orchestrator must fall back (loudly) to a recall-identical
151    /// non-cooperative path rather than launch a kernel that would deadlock.
152    #[error(
153        "cooperative grid-sync launch needs {grid_blocks} co-resident block(s) but the device can fit at most {resident_limit}.          Fix: route this dispatch to the resident-fixpoint or host-split grid-sync path, reduce the grid/workgroup size, or lower kernel register/shared-memory pressure. Detail: {detail}"
154    )]
155    CooperativeResidencyExceeded {
156        /// Blocks the launch geometry requires.
157        grid_blocks: u64,
158        /// Blocks the device can keep co-resident for this kernel.
159        resident_limit: u64,
160        /// Which residency bound tripped (thread vs occupancy) and the geometry.
161        detail: String,
162    },
163
164    /// Fallback for backends that have not migrated to structured errors.
165    ///
166    /// New backends should use a specific variant. This variant exists
167    /// solely to preserve backward compatibility with [`BackendError::new`].
168    #[error("{0}")]
169    Raw(String),
170}
171
172impl From<crate::Error> for BackendError {
173    fn from(error: crate::Error) -> Self {
174        Self::new(error.to_string())
175    }
176}
177
178impl BackendError {
179    /// Build a fallback [`BackendError::Raw`] after verifying the message is actionable.
180    ///
181    /// If the supplied message already contains a `Fix: ` section it is used
182    /// verbatim. Otherwise a generic fallback hint is appended. Prefer specific
183    /// variants (`DeviceOutOfMemory`, `KernelCompileFailed`, etc.) over this
184    /// constructor in new code.
185    ///
186    /// # Examples
187    ///
188    /// ```
189    /// use vyre::BackendError;
190    ///
191    /// let err = BackendError::new("queue full. Fix: retry with a smaller dispatch size.");
192    /// assert_eq!(err.to_string(), "queue full. Fix: retry with a smaller dispatch size.");
193    /// ```
194    pub fn new(message: impl Into<String>) -> Self {
195        let message = message.into();
196        if message.contains("Fix: ") {
197            return Self::Raw(message);
198        }
199        Self::Raw(format!(
200            "{message}. Fix: include backend-specific recovery guidance."
201        ))
202    }
203
204    /// Build an actionable unsupported-extension error for opaque IR payloads.
205    #[must_use]
206    pub fn unsupported_extension(
207        backend: impl Into<String>,
208        extension_kind: &str,
209        debug_identity: &str,
210    ) -> Self {
211        Self::UnsupportedFeature {
212            name: format!("opaque IR extension `{extension_kind}`/`{debug_identity}`"),
213            backend: backend.into(),
214        }
215    }
216
217    /// Build a structured lock-poisoning error.
218    ///
219    /// This constructor accepts any `PoisonError` from `RwLock` operations
220    /// and returns an actionable error carrying the root poison metadata.
221    pub fn poisoned_lock<T>(error: std::sync::PoisonError<T>) -> Self {
222        Self::PoisonedLock {
223            lock_error: error.to_string(),
224        }
225    }
226
227    /// Human-readable failure message, equivalent to [`ToString::to_string`].
228    ///
229    /// Prefer explicit `match` on variants or [`ErrorCode`] for programmatic
230    /// error handling; avoid string-parsing this output.
231    #[must_use]
232    pub fn message(&self) -> String {
233        self.to_string()
234    }
235
236    /// Consume this error and return its message string.
237    ///
238    /// Useful in `map_err` chains that expect `String`.
239    #[must_use]
240    pub fn into_message(self) -> String {
241        self.to_string()
242    }
243
244    /// Machine-readable error code for programmatic error handling.
245    ///
246    /// Use this to drive retry logic, circuit breakers, and alerting
247    /// without parsing human-readable message strings.
248    #[must_use]
249    pub fn code(&self) -> ErrorCode {
250        match self {
251            Self::DeviceOutOfMemory { .. } => ErrorCode::DeviceOutOfMemory,
252            Self::UnsupportedFeature { .. } => ErrorCode::UnsupportedFeature,
253            Self::PoisonedLock { .. } => ErrorCode::PoisonedLock,
254            Self::KernelCompileFailed { .. } => ErrorCode::KernelCompileFailed,
255            Self::DispatchFailed { .. } => ErrorCode::DispatchFailed,
256            Self::InvalidProgram { .. } => ErrorCode::InvalidProgram,
257            Self::CooperativeResidencyExceeded { .. } => ErrorCode::CooperativeResidencyExceeded,
258            Self::Raw(_) => ErrorCode::Unknown,
259        }
260    }
261}