gam_gpu/gpu_error.rs
1//! Typed error for the `src/gpu/*` modules.
2//!
3//! Every fallible path inside the GPU layer (driver dlopen, CUDA driver
4//! API calls, cuBLAS / cuSPARSE / cuSOLVER handle lifecycle, on-device
5//! allocations and memcpys, throughput calibration) constructs one of the
6//! variants below. Module-internal `Result<_, String>` surfaces convert
7//! via `From<GpuError> for String`, which preserves the exact bytes of
8//! the prior `format!` / `to_string` payloads so logged messages are
9//! byte-equivalent to the pre-refactor strings.
10//!
11//! Only the variants actually constructed by the GPU layer are kept.
12
/// Typed error for `src/gpu/*.rs` operations.
///
/// Each variant names a failure category and carries a single `reason`
/// string with the diagnostic text for that failure. The type derives
/// `PartialEq` / `Eq` (both the variant and its `reason` must match), so
/// callers and tests can compare errors directly with `==` / `assert_eq!`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum GpuError {
    /// The CUDA driver shared library (`libcuda.so` / `nvcuda.dll` /
    /// `libcuda.dylib`) or one of its sibling stubs (cuSOLVER, cuSPARSE)
    /// could not be loaded from any of the searched candidates.
    DriverLibraryUnavailable { reason: String },
    /// A required CUDA / cuBLAS / cuSOLVER / cuSPARSE symbol was missing
    /// from a loaded library (i.e. `libloading::Library::get` returned an
    /// error for a name we need).
    DriverSymbolMissing { reason: String },
    /// A CUDA driver / cuSOLVER / cuSPARSE C API call returned a non-zero
    /// status code, or a `cudarc` safe wrapper (context bind, stream
    /// create, cuBLAS init, alloc, memcpy, gemm, synchronize) failed.
    DriverCallFailed { reason: String },
    /// Runtime throughput calibration produced an unusable measurement
    /// (non-positive elapsed time, non-finite GB/s or GFLOPS).
    CalibrationFailed { reason: String },
    /// No device kernel exists for the requested GPU code path on this build.
    ///
    /// Device kernels are added opportunistically as accelerations; the
    /// absence of one is a permanently-possible, correctly-handled
    /// condition — not a defect — because the CPU path it falls back to is
    /// the correct reference computation. Callers treat this as a sentinel
    /// to fall back to the CPU path silently (no panic, no error log).
    ///
    /// Distinct from `DriverCallFailed` so the dispatcher can tell "no
    /// kernel for this path" apart from "the device refused". Carries a
    /// short reason for diagnostics, e.g. the kernel name. (Any GPU
    /// acceleration roadmap belongs in an issue, not here.)
    NoDeviceKernel { reason: String },
}
42
43impl std::fmt::Display for GpuError {
44 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
45 match self {
46 Self::DriverLibraryUnavailable { reason }
47 | Self::DriverSymbolMissing { reason }
48 | Self::DriverCallFailed { reason }
49 | Self::CalibrationFailed { reason }
50 | Self::NoDeviceKernel { reason } => f.write_str(reason),
51 }
52 }
53}
54
55impl std::error::Error for GpuError {}
56
57impl From<GpuError> for String {
58 fn from(err: GpuError) -> String {
59 err.to_string()
60 }
61}
62
/// Shorthand constructor for a `GpuError::DriverCallFailed` value.
///
/// `gpu_err!(...)` expands to
/// `GpuError::DriverCallFailed { reason: format!(...) }`, replacing the
/// struct literal that would otherwise be spelled out at every call site.
/// Every token is handed to `format!` unchanged, so the caller keeps full
/// control over the message body, including positional / named captures
/// and interpolation of the per-site `err` binding.
#[macro_export]
macro_rules! gpu_err {
    ($($fmt_args:tt)*) => {
        $crate::gpu_error::GpuError::DriverCallFailed {
            reason: ::std::format!($($fmt_args)*),
        }
    };
}
77
/// Early-return counterpart of `gpu_err!`: expands to
/// `return Err(GpuError::DriverCallFailed { reason: format!(...) })`.
///
/// Turns every early-return driver-call failure into a single statement.
/// Intended for functions whose return type is `Result<_, GpuError>`. All
/// arguments are forwarded to `format!` unchanged.
#[macro_export]
macro_rules! gpu_bail {
    // The error value is built inline instead of via `$crate::gpu_err!`:
    // `lib.rs` pulls this module tree in through `include!`, which makes the
    // `#[macro_export]`ed `gpu_err` count as macro-expanded, and naming it
    // by the absolute `$crate::` path trips a denied future-incompat lint.
    // Keep this expansion in sync with `gpu_err!`.
    ($($fmt_args:tt)*) => {
        return ::std::result::Result::Err(
            $crate::gpu_error::GpuError::DriverCallFailed {
                reason: ::std::format!($($fmt_args)*),
            },
        )
    };
}
95
96/// Extension trait that attaches GPU-call context to any `Result<T, E>`
97/// whose error implements `Display`.
98///
99/// The two methods mirror the common shapes:
100/// * [`gpu_ctx`](GpuResultExt::gpu_ctx) appends `": {err}"` to a
101/// caller-supplied prefix. This is the vastly dominant shape across
102/// the GPU layer (~235 sites in the original audit).
103/// * [`gpu_ctx_with`](GpuResultExt::gpu_ctx_with) takes a closure that
104/// receives the underlying error by `&dyn Display` and returns the
105/// full reason string. Use it when the reason is not a simple
106/// `prefix: err` concatenation (e.g. multi-line, or with the error
107/// embedded mid-message).
108///
109/// **Cfg note**: The trait and its blanket impl are gated to
110/// `target_os = "linux"` so the symbol literally does not exist on
111/// non-Linux targets. Every callsite is inside a
112/// `#[cfg(target_os = "linux")]` block that wraps CUDA driver / cuBLAS /
113/// cuSOLVER calls; on non-Linux those blocks are erased and the trait
114/// would have no users. Cfg-gating the definition means a warning-fix
115/// sweep running on non-Linux cannot see "unused" callsites because the
116/// trait itself is absent — the consuming `use super::gpu_error::GpuResultExt;`
117/// imports must therefore be `#[cfg(target_os = "linux")]` to match, and
118/// that cfg-symmetry is the architectural contract that prevents the
119/// drop-the-import regression that broke the Linux build in #302.
120#[cfg(target_os = "linux")]
121pub trait GpuResultExt<T> {
122 /// Map the error to `GpuError::DriverCallFailed { reason: format!("{prefix}: {err}") }`.
123 fn gpu_ctx(self, prefix: &str) -> Result<T, GpuError>;
124
125 /// Map the error using a closure that takes the underlying error
126 /// (as `&dyn Display`) and returns the reason string.
127 fn gpu_ctx_with<F>(self, f: F) -> Result<T, GpuError>
128 where
129 F: FnOnce(&dyn std::fmt::Display) -> String;
130}
131
132#[cfg(target_os = "linux")]
133impl<T, E: std::fmt::Display> GpuResultExt<T> for Result<T, E> {
134 #[inline]
135 fn gpu_ctx(self, prefix: &str) -> Result<T, GpuError> {
136 self.map_err(|err| GpuError::DriverCallFailed {
137 reason: format!("{prefix}: {err}"),
138 })
139 }
140
141 #[inline]
142 fn gpu_ctx_with<F>(self, f: F) -> Result<T, GpuError>
143 where
144 F: FnOnce(&dyn std::fmt::Display) -> String,
145 {
146 self.map_err(|err| GpuError::DriverCallFailed { reason: f(&err) })
147 }
148}