atomr_accel/error.rs
1//! Backend-agnostic error taxonomy.
2//!
3//! Every backend's error type implements `From<AccelError>`, so
4//! generic code can return [`AccelError`] directly while
5//! backend-specific code keeps richer typed variants. The enum is
6//! `#[non_exhaustive]` — backends are free to add `LibraryError`
7//! tags (`"cublas"`, `"cudnn"`, `"hipblas"`, `"mps"`, etc.) without
8//! breaking core consumers.
9
10use thiserror::Error;
11
/// Convenience alias for a `Result` whose error type is [`AccelError`].
pub type AccelResult<T> = Result<T, AccelError>;
13
/// Marker prefix used in panic messages to signal a poisoned-context
/// error. Backends panic with messages containing these tags so the
/// supervisor decider can route to `Restart` / `Resume` / `Stop` /
/// `Escalate` without parsing the typed enum from a panic payload.
///
/// Matches the `Display` prefix of [`AccelError::ContextPoisoned`].
pub const CONTEXT_POISONED_TAG: &str = "ContextPoisoned";
/// Marker prefix for out-of-memory panic messages. Matches the
/// `Display` prefix of [`AccelError::OutOfMemory`].
pub const OUT_OF_MEMORY_TAG: &str = "OutOfMemory";
/// Marker prefix for unrecoverable-failure panic messages. Matches the
/// `Display` prefix of [`AccelError::Unrecoverable`].
pub const UNRECOVERABLE_TAG: &str = "Unrecoverable";
21
22/// Typed error enum surfaced through every actor reply channel.
23///
24/// Mirrors the original `GpuError` from `atomr-accel-cuda` but lives
25/// in the backend-agnostic core. Backends wrap or re-export this as
26/// their public `Error` associated type.
27#[derive(Debug, Error)]
28#[non_exhaustive]
29pub enum AccelError {
30 /// Device context is in a sticky-error state. Triggers
31 /// `ContextActor` restart and a generation bump.
32 #[error("ContextPoisoned: {0}")]
33 ContextPoisoned(String),
34
35 /// Allocation failed but the context is still usable. Supervisor
36 /// `Resume`s the actor.
37 #[error("OutOfMemory: {0}")]
38 OutOfMemory(String),
39
40 /// Hardware fault or repeated poisoning past the retry budget.
41 #[error("Unrecoverable: {0}")]
42 Unrecoverable(String),
43
44 /// `AccelRef::access()` was called on a buffer whose context was
45 /// rebuilt or whose `DeviceActor` is shutting down.
46 #[error("AccelRef stale: {0}")]
47 AccelRefStale(&'static str),
48
49 /// Driver-level error (e.g. `cuInit`, `hipInit`, `MTLDevice`
50 /// setup) before any specific library got involved.
51 #[error("driver error: {0}")]
52 Driver(String),
53
54 /// Library error tagged with the originating component name —
55 /// e.g. `"cublas"`, `"cudnn"`, `"cufft"`, `"curand"`,
56 /// `"cusolver"`, `"cublaslt"`, `"nvrtc"`, `"nccl"`, `"hipblas"`,
57 /// `"rocfft"`, `"mps"`. Callers that need to discriminate match
58 /// on `lib`.
59 #[error("{lib} error: {msg}")]
60 LibraryError { lib: &'static str, msg: String },
61
62 #[error("ask timed out before completion")]
63 Timeout,
64}
65
66impl AccelError {
67 /// Construct a tagged library error.
68 pub fn lib(lib: &'static str, msg: impl Into<String>) -> Self {
69 Self::LibraryError {
70 lib,
71 msg: msg.into(),
72 }
73 }
74
75 /// Format suitable for panicking out of an actor handler so
76 /// that the atomr supervisor's decider can route it to a
77 /// directive based on the tagged prefix.
78 pub fn panic_message(&self) -> String {
79 self.to_string()
80 }
81}
82
#[cfg(test)]
mod tests {
    use super::*;

    /// `AccelError::lib` must yield a `LibraryError` carrying the given
    /// tag and message.
    #[test]
    fn library_error_constructor() {
        let err = AccelError::lib("cudnn", "create_handle failed");
        match err {
            AccelError::LibraryError { lib, msg } => {
                assert_eq!(lib, "cudnn");
                assert!(msg.contains("create_handle"));
            }
            // Report the variant actually produced to ease debugging.
            other => panic!("expected LibraryError, got {:?}", other),
        }
    }

    /// Every supervisor-routed variant must render with its tag as the
    /// message prefix, not just `ContextPoisoned`.
    #[test]
    fn panic_message_carries_tag() {
        let cases = [
            (
                AccelError::ContextPoisoned("cuInit failed".into()),
                CONTEXT_POISONED_TAG,
            ),
            (
                AccelError::OutOfMemory("alloc failed".into()),
                OUT_OF_MEMORY_TAG,
            ),
            (
                AccelError::Unrecoverable("retry budget exhausted".into()),
                UNRECOVERABLE_TAG,
            ),
        ];

        for (err, tag) in cases.iter() {
            let message = err.panic_message();
            assert!(
                message.starts_with(tag),
                "panic message {:?} should start with tag {:?}",
                message,
                tag
            );
        }
    }
}
105}