Skip to main content

atomr_accel/
error.rs

//! Backend-agnostic error taxonomy.
//!
//! Every backend's error type implements `From<AccelError>`, so
//! generic code can return [`AccelError`] directly while
//! backend-specific code keeps richer typed variants. The enum is
//! `#[non_exhaustive]`, so new variants can be added later, and
//! backends are free to introduce new `LibraryError` tags
//! (`"cublas"`, `"cudnn"`, `"hipblas"`, `"mps"`, etc.) without
//! breaking core consumers.
9
10use thiserror::Error;
11
12pub type AccelResult<T> = Result<T, AccelError>;
13
/// Marker tag used in panic messages to signal a poisoned-context
/// error.
///
/// Backends panic with messages containing one of the tags defined
/// here so the supervisor decider can route to `Restart` / `Resume` /
/// `Stop` / `Escalate` without parsing the typed enum from a panic
/// payload. This is the prefix of the `Display` output of
/// [`AccelError::ContextPoisoned`].
pub const CONTEXT_POISONED_TAG: &str = "ContextPoisoned";

/// Marker tag used in panic messages to signal an out-of-memory
/// error. This is the prefix of the `Display` output of
/// [`AccelError::OutOfMemory`].
pub const OUT_OF_MEMORY_TAG: &str = "OutOfMemory";

/// Marker tag used in panic messages to signal an unrecoverable
/// error. This is the prefix of the `Display` output of
/// [`AccelError::Unrecoverable`].
pub const UNRECOVERABLE_TAG: &str = "Unrecoverable";
21
22/// Typed error enum surfaced through every actor reply channel.
23///
24/// Mirrors the original `GpuError` from `atomr-accel-cuda` but lives
25/// in the backend-agnostic core. Backends wrap or re-export this as
26/// their public `Error` associated type.
27#[derive(Debug, Error)]
28#[non_exhaustive]
29pub enum AccelError {
30    /// Device context is in a sticky-error state. Triggers
31    /// `ContextActor` restart and a generation bump.
32    #[error("ContextPoisoned: {0}")]
33    ContextPoisoned(String),
34
35    /// Allocation failed but the context is still usable. Supervisor
36    /// `Resume`s the actor.
37    #[error("OutOfMemory: {0}")]
38    OutOfMemory(String),
39
40    /// Hardware fault or repeated poisoning past the retry budget.
41    #[error("Unrecoverable: {0}")]
42    Unrecoverable(String),
43
44    /// `AccelRef::access()` was called on a buffer whose context was
45    /// rebuilt or whose `DeviceActor` is shutting down.
46    #[error("AccelRef stale: {0}")]
47    AccelRefStale(&'static str),
48
49    /// Driver-level error (e.g. `cuInit`, `hipInit`, `MTLDevice`
50    /// setup) before any specific library got involved.
51    #[error("driver error: {0}")]
52    Driver(String),
53
54    /// Library error tagged with the originating component name —
55    /// e.g. `"cublas"`, `"cudnn"`, `"cufft"`, `"curand"`,
56    /// `"cusolver"`, `"cublaslt"`, `"nvrtc"`, `"nccl"`, `"hipblas"`,
57    /// `"rocfft"`, `"mps"`. Callers that need to discriminate match
58    /// on `lib`.
59    #[error("{lib} error: {msg}")]
60    LibraryError { lib: &'static str, msg: String },
61
62    #[error("ask timed out before completion")]
63    Timeout,
64}
65
66impl AccelError {
67    /// Construct a tagged library error.
68    pub fn lib(lib: &'static str, msg: impl Into<String>) -> Self {
69        Self::LibraryError {
70            lib,
71            msg: msg.into(),
72        }
73    }
74
75    /// Format suitable for panicking out of an actor handler so
76    /// that the atomr supervisor's decider can route it to a
77    /// directive based on the tagged prefix.
78    pub fn panic_message(&self) -> String {
79        self.to_string()
80    }
81}
82
#[cfg(test)]
mod tests {
    use super::*;

    /// `AccelError::lib` stores both the library tag and the message.
    #[test]
    fn library_error_constructor() {
        let e = AccelError::lib("cudnn", "create_handle failed");
        match e {
            AccelError::LibraryError { lib, msg } => {
                assert_eq!(lib, "cudnn");
                assert!(msg.contains("create_handle"));
            }
            _ => panic!("expected LibraryError"),
        }
    }

    /// Every tagged variant must render with its tag as the message
    /// prefix, since the supervisor routes on that text.
    #[test]
    fn panic_message_carries_tag() {
        let cases = [
            (
                AccelError::ContextPoisoned("cuInit failed".into()),
                CONTEXT_POISONED_TAG,
            ),
            (
                AccelError::OutOfMemory("alloc failed".into()),
                OUT_OF_MEMORY_TAG,
            ),
            (
                AccelError::Unrecoverable("device lost".into()),
                UNRECOVERABLE_TAG,
            ),
        ];
        for (err, tag) in &cases {
            let m = err.panic_message();
            assert!(m.contains(tag));
            assert!(
                m.starts_with(tag),
                "{:?} should start with {:?}",
                m,
                tag
            );
        }
    }
}
105}