gam_gpu/device_runtime.rs
1#[cfg(target_os = "linux")]
2use std::collections::HashMap;
3#[cfg(target_os = "linux")]
4use std::panic::{self, AssertUnwindSafe, catch_unwind};
5use std::sync::OnceLock;
6#[cfg(target_os = "linux")]
7use std::sync::{Arc, Mutex};
8
9use super::device::GpuDeviceInfo;
10use super::gpu_error::GpuError;
11use super::policy::GpuDispatchPolicy;
12#[cfg(target_os = "linux")]
13use cudarc::driver::{CudaContext, result, sys};
14
15#[path = "runtime_diagnostics.rs"]
16pub(crate) mod diagnostics;
17
18#[derive(Clone, Debug)]
19#[must_use]
20pub struct GpuRuntime {
21 /// Highest-scoring probed CUDA device. Existing dispatch code routes
22 /// one-shot kernels through this device.
23 pub device: GpuDeviceInfo,
24 /// All usable CUDA devices discovered at probe time, ordered by score.
25 pub devices: Vec<GpuDeviceInfo>,
26 pub policy: GpuDispatchPolicy,
27 pub memory_budget_bytes: usize,
28}
29
30static CPU_REASON: OnceLock<String> = OnceLock::new();
31
32/// Install a process-wide panic hook (idempotent) that drops cudarc's
33/// `panic_no_lib_found` message instead of writing it to stderr. All other
34/// panics flow to the previously installed hook unchanged. The site cudarc
35/// 0.19 panics from is `cudarc-0.19.7/src/lib.rs:200` inside its dynamic
36/// loader; messages from that path start with `Unable to dynamically load`.
37/// Caller code wraps the same cudarc entry points in `catch_unwind`, so the
38/// panic is recovered — this hook just prevents the stderr noise that made
39/// operators think the fit had crashed.
40#[cfg(target_os = "linux")]
41fn install_cudarc_panic_filter() {
42 static HOOK_INSTALLED: OnceLock<()> = OnceLock::new();
43 HOOK_INSTALLED.get_or_init(|| {
44 let prior = panic::take_hook();
45 panic::set_hook(Box::new(move |info| {
46 let payload = info.payload();
47 let message = payload
48 .downcast_ref::<&'static str>()
49 .copied()
50 .or_else(|| payload.downcast_ref::<String>().map(String::as_str))
51 .unwrap_or("");
52 if message.starts_with("Unable to dynamically load") {
53 return;
54 }
55 prior(info);
56 }));
57 });
58}
59
60impl GpuRuntime {
61 pub fn probe() -> Result<Option<Self>, GpuError> {
62 if super::global_policy() == super::GpuPolicy::Off {
63 Self::record_cpu_reason("GPU policy is off");
64 diagnostics::log_cuda_disabled("GPU policy is off");
65 return Ok(None);
66 }
67
68 #[cfg(not(target_os = "linux"))]
69 {
70 let reason = "CUDA support not compiled into this build";
71 Self::record_cpu_reason(reason);
72 diagnostics::log_cuda_disabled(reason);
73 return Err(GpuError::DriverLibraryUnavailable {
74 reason: reason.to_string(),
75 });
76 }
77
78 #[cfg(target_os = "linux")]
79 {
80 // `cudarc 0.19`'s entry points lazily initialize the CUDA driver
81 // through generated `culib()` helpers. On CPU-only Linux hosts the
82 // first such call emits `panic_no_lib_found` before unwinding, which
83 // polluted large-scale logs even when the panic was later caught and the
84 // fit fell back to CPU. Keep the preflight completely outside
85 // cudarc: use gam's own `libloading` probe first, and only touch
86 // cudarc after the platform loader can open `libcuda`.
87 //
88 // The preflight does not always agree with cudarc's own loader
89 // candidate list (e.g. large-scale workbench images expose CUDA *runtime*
90 // stub libraries under `/usr/local/cuda-*/targets/.../lib` but no
91 // driver `libcuda.so` in any loader path), so we additionally
92 // install a panic-hook filter that suppresses cudarc's
93 // `panic_no_lib_found` message and wrap every cudarc entry point
94 // below in `catch_unwind` to convert the panic into a typed
95 // `GpuError::DriverCallFailed` instead.
96 install_cudarc_panic_filter();
97 // #1017 probe-first fix: establish cudarc's primary context P and
98 // initialize the CUDA runtime ON IT as the VERY FIRST CUDA action -- before
99 // gam's libloading libcuda preload, the compute-lib dlopens, and device_count.
100 // The clean cuda_context_for-first path works; the probe-first path failed
101 // because a pre-context CUDA touch left the runtime bound to a non-P context,
102 // so later cuBLAS/cuSOLVER handle creation on the P-stream returned
103 // NOT_INITIALIZED. Making cuda_context_for the first action replicates the
104 // working clean path (CudaContext::new loads libcuda + retains the primary +
105 // ensure runs the runtime init); on a CPU-only host it returns None cleanly
106 // via the panic filter + catch_unwind, and the preload check below still runs.
107 let primary_ready = cuda_context_for(0).is_some();
108 log::trace!("[GPU] probe pre-init primary context + runtime: {primary_ready}");
109 if crate::driver::preload_cuda_driver().is_err() {
110 let reason = "libcuda unavailable";
111 Self::record_cpu_reason(reason);
112 log::info!("[GPU] CUDA acceleration disabled: {reason}");
113 diagnostics::log_cuda_disabled(reason);
114 return Err(GpuError::DriverLibraryUnavailable {
115 reason: reason.to_string(),
116 });
117 }
118
119 // Driver-only environments (e.g. large-scale workbench images that expose
120 // `libcuda.so.1` but ship no cuBLAS/cuSOLVER/cuSPARSE) used to slip
121 // past the libcuda preflight, enable the runtime, and then panic
122 // out of cudarc's `panic_no_lib_found` on the first `CudaBlas::new`
123 // — the panic crossed the PyO3 FFI boundary as a
124 // `ValueError: fit_table panicked inside Rust boundary: Unable to
125 // dynamically load the "cublas" shared library`. The compute
126 // libraries are dispatch-critical (every cuBLAS / cuSOLVER /
127 // cuSPARSE site under `src/gpu/` calls `CudaBlas::new` /
128 // `DnHandle::new` / cusparse handle creation eagerly during
129 // workspace allocation), so we refuse to advertise GPU unless all
130 // three load cleanly here.
131 for stem in ["cublas", "cusolver", "cusparse"] {
132 if !crate::driver::cuda_compute_library_present(stem) {
133 let reason = format!("lib{stem} unavailable");
134 Self::record_cpu_reason(reason.clone());
135 log::info!("[GPU] CUDA acceleration disabled: {reason}");
136 diagnostics::log_cuda_disabled(&reason);
137 return Err(GpuError::DriverLibraryUnavailable { reason });
138 }
139 }
140
141 // cudarc 0.19's `culib()` panics via `panic_no_lib_found` when its
142 // own (separate from gam's) dynamic-loader candidate list cannot
143 // find libcuda — this can happen even after our `preload_cuda_driver`
144 // succeeds, for example if our probe loaded a CUDA stub library but
145 // cudarc's loader searches a disjoint set of names. Convert any such
146 // panic into a typed probe failure so the runtime cleanly disables
147 // CUDA and the CPU fallback proceeds without alarming stderr noise.
148 let device_count = catch_unwind(AssertUnwindSafe(CudaContext::device_count))
149 .map_err(|_| GpuError::DriverLibraryUnavailable {
150 reason: "libcuda unavailable".to_string(),
151 })?
152 .map_err(|err| GpuError::DriverCallFailed {
153 reason: err.to_string(),
154 })?;
155 if device_count <= 0 {
156 let reason = "CUDA driver reported no devices";
157 Self::record_cpu_reason(reason);
158 diagnostics::log_cuda_disabled(reason);
159 // Surface the no-device state as a structured `DriverCallFailed`
160 // so callers wanting a CPU-reason marker can distinguish
161 // "policy off" (Ok(None)) from "driver present but no usable
162 // hardware" (Err). This keeps `GpuRuntime::probe()` honest: a
163 // successful `Ok` always carries at least one device.
164 return Err(GpuError::DriverCallFailed {
165 reason: reason.to_string(),
166 });
167 }
168
169 let mut devices = Vec::new();
170 for ordinal in
171 0..usize::try_from(device_count).map_err(|_| GpuError::DriverCallFailed {
172 reason: "negative CUDA device count".into(),
173 })?
174 {
175 let ctx = cuda_context_for(ordinal).ok_or_else(|| {
176 gpu_err!("failed to create CUDA context for device {ordinal}")
177 })?;
178 catch_unwind(AssertUnwindSafe(|| ctx.bind_to_thread()))
179 .map_err(|_| GpuError::DriverLibraryUnavailable {
180 reason: "libcuda unavailable".to_string(),
181 })?
182 .map_err(|err| GpuError::DriverCallFailed {
183 reason: err.to_string(),
184 })?;
185 devices.push(
186 catch_unwind(AssertUnwindSafe(|| cuda_device_info(ordinal, &ctx))).map_err(
187 |_| GpuError::DriverLibraryUnavailable {
188 reason: "libcuda unavailable".to_string(),
189 },
190 )??,
191 );
192 }
193
194 devices.sort_by(|a, b| b.score().total_cmp(&a.score()));
195 let Some(device) = devices.first().cloned() else {
196 Self::record_cpu_reason("CUDA driver reported no usable devices");
197 diagnostics::log_cuda_disabled("CUDA driver reported no usable devices");
198 return Ok(None);
199 };
200
201 let policy = crate::calibration::calibrated_policy_for_device(&device);
202 let memory_budget_bytes = device.memory_budget_bytes();
203 diagnostics::log_cuda_enabled(&device, &policy);
204 diagnostics::log_cuda_pool(&devices);
205
206 Ok(Some(Self {
207 device,
208 devices,
209 policy,
210 memory_budget_bytes,
211 }))
212 }
213 }
214
215 #[must_use]
216 pub fn global() -> Option<&'static Self> {
217 static RUNTIME: OnceLock<Option<GpuRuntime>> = OnceLock::new();
218 RUNTIME
219 .get_or_init(|| match Self::probe() {
220 Ok(runtime) => runtime,
221 Err(err) => {
222 let reason = err.to_string();
223 Self::record_cpu_reason(reason.clone());
224 diagnostics::log_cuda_disabled(&reason);
225 None
226 }
227 })
228 .as_ref()
229 }
230
231 #[must_use]
232 pub fn is_available() -> bool {
233 Self::global().is_some()
234 }
235
236 /// Fail-closed accessor for the process-wide runtime under a [`GpuMode`]
237 /// contract (issue #1017).
238 ///
239 /// * [`GpuMode::Required`] — the device MUST be present: when the probe
240 /// found no usable runtime this returns `Err(GpuError::DriverLibraryUnavailable)`
241 /// carrying the recorded CPU reason, so the resident path surfaces a
242 /// structured error instead of silently falling back to the CPU.
243 /// * [`GpuMode::Auto`] / [`GpuMode::Off`] — preserve the existing
244 /// probe-first behavior bit-for-bit: this is a thin wrapper over
245 /// [`Self::global`] that maps the `None` case to the same typed error
246 /// without ever forcing the runtime on or changing any numerics. `Auto`
247 /// callers treat the `Err` exactly as they treated `global().is_none()`
248 /// today (fall back to CPU); only the `Required` caller propagates it.
249 ///
250 /// This does NOT alter `global()`/`cuda_context_for`/`ensure_cuda_runtime_device`;
251 /// it only adds the residency gate on top of the working Auto path.
252 pub fn global_or_fail(mode: super::GpuMode) -> Result<&'static Self, GpuError> {
253 match mode {
254 super::GpuMode::Off => Err(GpuError::DriverLibraryUnavailable {
255 reason: "GPU residency mode is off".to_string(),
256 }),
257 super::GpuMode::Auto | super::GpuMode::Required => {
258 Self::global().ok_or_else(|| GpuError::DriverLibraryUnavailable {
259 reason: Self::cpu_reason()
260 .unwrap_or("CUDA runtime unavailable")
261 .to_string(),
262 })
263 }
264 }
265 }
266
267 #[must_use]
268 pub fn policy(&self) -> &GpuDispatchPolicy {
269 &self.policy
270 }
271
272 #[must_use]
273 pub fn selected_device(&self) -> &GpuDeviceInfo {
274 &self.device
275 }
276
277 #[must_use]
278 pub(crate) fn cpu_reason() -> Option<&'static str> {
279 CPU_REASON.get().map(String::as_str)
280 }
281
282 fn record_cpu_reason(reason: impl Into<String>) {
283 CPU_REASON.set(reason.into()).ok();
284 }
285}
286
287/// Make the CUDA **runtime** API usable on `ordinal`.
288///
289/// gam drives the GPU through the CUDA *driver* API (cudarc [`CudaContext`]),
290/// which materialises the driver primary context but never selects a device for
291/// the CUDA *runtime* API. cuBLAS / cuSOLVER are runtime-based, so `cublasCreate`
292/// / `cusolverDnCreate` return `CUBLAS_STATUS_NOT_INITIALIZED` /
293/// `CUSOLVER_STATUS_NOT_INITIALIZED` until the runtime has a current device —
294/// which silently disables *every* GPU linear-algebra path (the dispatch sites
295/// map the handle error to `Unavailable` and fall back to CPU). We select the
296/// device on the calling host thread (cheap, idempotent) and force one-time
297/// runtime primary-context materialisation per device via the canonical
298/// `cudaMalloc`/`cudaFree` idiom, so every downstream handle creation succeeds.
299#[cfg(target_os = "linux")]
300fn ensure_cuda_runtime_device(ordinal: usize) {
301 let Ok(o) = i32::try_from(ordinal) else {
302 return;
303 };
304 // SAFETY: the `runtime` cudarc feature is enabled; cudaSetDevice on a valid
305 // ordinal is idempotent and per-host-thread.
306 let set_rc = unsafe { cudarc::runtime::sys::cudaSetDevice(o) };
307 log::trace!("[GPU] runtime cudaSetDevice({o}) -> {set_rc:?}");
308 // Materialise the runtime primary context on EVERY call (not once): the driver
309 // probe binds cudarc's own context, and cuBLAS/cuSOLVER `*Create` use whatever
310 // context is current at creation time, so the runtime device must be reselected
311 // and its primary context re-materialised immediately before each handle is made.
312 // A 256-byte allocate-then-free is the canonical, ~microsecond way to force it.
313 let mut p: *mut core::ffi::c_void = core::ptr::null_mut();
314 // SAFETY: forces runtime primary-context creation on the current device.
315 let malloc_rc = unsafe { cudarc::runtime::sys::cudaMalloc(&mut p as *mut _ as *mut _, 256) };
316 log::trace!("[GPU] runtime cudaMalloc -> {malloc_rc:?}");
317 if !p.is_null() {
318 // SAFETY: `p` is the live device allocation returned just above.
319 let free_rc = unsafe { cudarc::runtime::sys::cudaFree(p) };
320 log::trace!("[GPU] runtime cudaFree -> {free_rc:?}");
321 }
322}
323
324#[cfg(target_os = "linux")]
325pub fn cuda_context_for(ordinal: usize) -> Option<Arc<CudaContext>> {
326 static CONTEXTS: OnceLock<Mutex<HashMap<usize, Arc<CudaContext>>>> = OnceLock::new();
327 let contexts = CONTEXTS.get_or_init(|| Mutex::new(HashMap::new()));
328 if let Some(ctx) = contexts.lock().ok()?.get(&ordinal).cloned() {
329 // Bind cudarc's PRIMARY context current on THIS thread BEFORE the runtime
330 // materialisation below, so the runtime initialises the same context that
331 // new_stream()/CudaBlas::new run cublasCreate against. Without this, on a
332 // fresh solve thread the cached path lets the runtime init its own device
333 // context, and the later cublasCreate on the primary-context stream fails
334 // CUBLAS/CUSOLVER_STATUS_NOT_INITIALIZED (the probe-first GPU-dead bug).
335 let bound = catch_unwind(AssertUnwindSafe(|| ctx.bind_to_thread()));
336 log::trace!(
337 "[GPU] cuda_context_for cached bind ok={}",
338 matches!(bound, Ok(Ok(())))
339 );
340 ensure_cuda_runtime_device(ordinal);
341 return Some(ctx);
342 }
343 // cudarc 0.19 panics from `panic_no_lib_found` if its loader fails to
344 // locate libcuda. Demote that to `None` so the runtime probe surfaces a
345 // typed `DriverUnavailable` rather than tearing down the worker thread.
346 let ctx = catch_unwind(AssertUnwindSafe(|| CudaContext::new(ordinal)))
347 .ok()?
348 .ok()?;
349 let out = {
350 let mut guard = contexts.lock().ok()?;
351 guard.entry(ordinal).or_insert_with(|| ctx.clone()).clone()
352 };
353 // CudaContext::new already bound the primary context, but the HashMap may return
354 // an entry created on another thread; rebind so the primary context is current on
355 // THIS thread before the runtime touch (same probe-first NOT_INITIALIZED guard).
356 let bound = catch_unwind(AssertUnwindSafe(|| out.bind_to_thread()));
357 log::trace!(
358 "[GPU] cuda_context_for fresh bind ok={}",
359 matches!(bound, Ok(Ok(())))
360 );
361 ensure_cuda_runtime_device(ordinal);
362 Some(out)
363}
364
365#[cfg(target_os = "linux")]
366fn cuda_device_info(ordinal: usize, ctx: &CudaContext) -> Result<GpuDeviceInfo, GpuError> {
367 result::init().map_err(|err| GpuError::DriverCallFailed {
368 reason: err.to_string(),
369 })?;
370 let device =
371 result::device::get(
372 i32::try_from(ordinal).map_err(|_| GpuError::DriverCallFailed {
373 reason: "device ordinal overflow".into(),
374 })?,
375 )
376 .map_err(|err| GpuError::DriverCallFailed {
377 reason: err.to_string(),
378 })?;
379 let attr = |attribute| -> Result<i32, GpuError> {
380 // SAFETY: device comes from cudarc's validated device::get.
381 unsafe { result::device::get_attribute(device, attribute) }.map_err(|err| {
382 GpuError::DriverCallFailed {
383 reason: err.to_string(),
384 }
385 })
386 };
387 let (free_mem_bytes, total_mem_bytes) =
388 ctx.mem_get_info()
389 .map_err(|err| GpuError::DriverCallFailed {
390 reason: err.to_string(),
391 })?;
392 let major = attr(sys::CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR)?;
393 let minor = attr(sys::CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR)?;
394 Ok(GpuDeviceInfo {
395 ordinal,
396 name: result::device::get_name(device).unwrap_or_else(|_| format!("CUDA device {ordinal}")),
397 capability: super::device::GpuCapability::from_compute_capability(major, minor),
398 sm_count: attr(sys::CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT)?,
399 max_threads_per_sm: attr(
400 sys::CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
401 )?,
402 max_shared_mem_per_block: attr(
403 sys::CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
404 )
405 .unwrap_or(0) as usize,
406 l2_cache_bytes: attr(sys::CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE)
407 .unwrap_or(0) as usize,
408 total_mem_bytes,
409 free_mem_bytes,
410 ecc_enabled: attr(sys::CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_ECC_ENABLED)
411 .unwrap_or(0)
412 != 0,
413 integrated: attr(sys::CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_INTEGRATED).unwrap_or(0)
414 != 0,
415 mig_mode: false,
416 })
417}
418
419#[cfg(test)]
420mod module_path_lock_tests {
421 //! Locks the canonical module path for the GPU device runtime so a future
422 //! rename is a deliberate, reviewed change (precedent: issue #1157's
423 //! "lock module path" tests). This file was renamed from the generic,
424 //! colliding `gpu/runtime.rs` to `gpu/device_runtime.rs` under issue #1137.
425
426 #[test]
427 fn gpu_device_runtime_module_path_is_canonical() {
428 // Resolving `GpuRuntime` through the `device_runtime` module path
429 // pins the honest name; if the module is renamed this stops compiling.
430 _ = crate::device_runtime::GpuRuntime::is_available();
431 let type_name = std::any::type_name::<crate::device_runtime::GpuRuntime>();
432 assert!(
433 type_name.contains("device_runtime"),
434 "GpuRuntime must live in the `device_runtime` module (got {type_name})"
435 );
436 }
437}
438
439#[cfg(all(test, target_os = "linux"))]
440mod tests {
441 use super::*;
442
443 /// On a CPU-only host (no `libcuda.dylib` / `libcuda.so` reachable via the
444 /// platform loader), exercising every cudarc-touching entry point in this
445 /// crate must produce a clean `None`/`Err` and never trigger
446 /// `cudarc::panic_no_lib_found`. This is the regression guard for issues
447 /// #168 and #176, which observed a `PanicException` escaping the PyO3
448 /// boundary on macOS when `sae_manifold_fit(..., atom_basis="duchon")` or
449 /// `d_atom=1` ran on a host with no CUDA driver.
450 ///
451 /// On a host where libcuda *is* present the test still passes — it asserts
452 /// only that calls don't panic and that `is_culib_present()` agrees with
453 /// `GpuRuntime::is_available()` about the absence of a driver.
454 #[test]
455 fn cpu_only_host_never_panics_on_gpu_entry_points() {
456 // Without libcuda the runtime must report unavailable rather than
457 // panicking from inside `culib()`; with libcuda the runtime may or
458 // may not have a usable device, but the panic-free contract still
459 // holds and the dispatch smoke test below exercises it.
460 let culib_present = crate::driver::cuda_driver_library_present();
461 if !culib_present {
462 assert!(
463 !GpuRuntime::is_available(),
464 "is_culib_present()=false but GpuRuntime::is_available() returned true; \
465 the probe guard from c10e6636 has regressed and downstream cudarc \
466 calls will panic"
467 );
468 }
469
470 // Every public GPU dispatch must return a value (no panic) when the
471 // runtime is unavailable. We use minimum-size inputs so a host that
472 // *does* have a GPU still passes (workload below dispatch threshold
473 // → returns None / Err / CPU fallback the same way).
474 use ndarray::{Array1, Array2};
475 let a = Array2::<f64>::zeros((4, 3));
476 let b = Array2::<f64>::zeros((3, 2));
477 let v = Array1::<f64>::zeros(3);
478 let w = Array1::<f64>::ones(4);
479
480 // gpu::linalg_dispatch dispatchers
481 crate::try_fast_ab(a.view(), b.view());
482 crate::try_fast_av(a.view(), v.view());
483 crate::try_fast_atv(a.view(), w.view());
484 let mut chol_in = Array2::<f64>::eye(3);
485 crate::try_cholesky_lower_inplace(&mut chol_in);
486
487 // gpu::solver Cholesky entry points
488 let h = Array2::<f64>::eye(3);
489 let rhs = Array2::<f64>::zeros((3, 1));
490 let solve_outcome = crate::solver::cholesky_solve_gpu(h.view(), rhs.view());
491 let factor_outcome = crate::solver::cholesky_lower_gpu(h.view());
492 if !GpuRuntime::is_available() {
493 assert!(
494 solve_outcome.is_err(),
495 "cholesky_solve_gpu must Err when runtime is unavailable"
496 );
497 assert!(
498 factor_outcome.is_err(),
499 "cholesky_lower_gpu must Err when runtime is unavailable"
500 );
501 }
502
503 // NOTE: the weighted-crossprod GPU dispatcher with CPU fallback
504 // (`weighted_crossprod_gpu`) moved out of this crate to `gam-solve`
505 // (`gpu::pirls_gpu`) during the #1521 crate carve, since it depends on
506 // the higher-level PIRLS assembly. Its panic-free / Ok-via-CPU-fallback
507 // contract is now exercised by a regression test there
508 // (`weighted_crossprod_gpu_cpu_fallback_*`), not from gam-gpu, which
509 // cannot reach gam-solve without a dependency cycle.
510 }
511}