gam_gpu/
device_runtime.rs

1#[cfg(target_os = "linux")]
2use std::collections::HashMap;
3#[cfg(target_os = "linux")]
4use std::panic::{self, AssertUnwindSafe, catch_unwind};
5use std::sync::OnceLock;
6#[cfg(target_os = "linux")]
7use std::sync::{Arc, Mutex};
8
9use super::device::GpuDeviceInfo;
10use super::gpu_error::GpuError;
11use super::policy::GpuDispatchPolicy;
12#[cfg(target_os = "linux")]
13use cudarc::driver::{CudaContext, result, sys};
14
15#[path = "runtime_diagnostics.rs"]
16pub(crate) mod diagnostics;
17
18#[derive(Clone, Debug)]
19#[must_use]
20pub struct GpuRuntime {
21    /// Highest-scoring probed CUDA device. Existing dispatch code routes
22    /// one-shot kernels through this device.
23    pub device: GpuDeviceInfo,
24    /// All usable CUDA devices discovered at probe time, ordered by score.
25    pub devices: Vec<GpuDeviceInfo>,
26    pub policy: GpuDispatchPolicy,
27    pub memory_budget_bytes: usize,
28}
29
30static CPU_REASON: OnceLock<String> = OnceLock::new();
31
32/// Install a process-wide panic hook (idempotent) that drops cudarc's
33/// `panic_no_lib_found` message instead of writing it to stderr. All other
34/// panics flow to the previously installed hook unchanged. The site cudarc
35/// 0.19 panics from is `cudarc-0.19.7/src/lib.rs:200` inside its dynamic
36/// loader; messages from that path start with `Unable to dynamically load`.
37/// Caller code wraps the same cudarc entry points in `catch_unwind`, so the
38/// panic is recovered — this hook just prevents the stderr noise that made
39/// operators think the fit had crashed.
40#[cfg(target_os = "linux")]
41fn install_cudarc_panic_filter() {
42    static HOOK_INSTALLED: OnceLock<()> = OnceLock::new();
43    HOOK_INSTALLED.get_or_init(|| {
44        let prior = panic::take_hook();
45        panic::set_hook(Box::new(move |info| {
46            let payload = info.payload();
47            let message = payload
48                .downcast_ref::<&'static str>()
49                .copied()
50                .or_else(|| payload.downcast_ref::<String>().map(String::as_str))
51                .unwrap_or("");
52            if message.starts_with("Unable to dynamically load") {
53                return;
54            }
55            prior(info);
56        }));
57    });
58}
59
60impl GpuRuntime {
61    pub fn probe() -> Result<Option<Self>, GpuError> {
62        if super::global_policy() == super::GpuPolicy::Off {
63            Self::record_cpu_reason("GPU policy is off");
64            diagnostics::log_cuda_disabled("GPU policy is off");
65            return Ok(None);
66        }
67
68        #[cfg(not(target_os = "linux"))]
69        {
70            let reason = "CUDA support not compiled into this build";
71            Self::record_cpu_reason(reason);
72            diagnostics::log_cuda_disabled(reason);
73            return Err(GpuError::DriverLibraryUnavailable {
74                reason: reason.to_string(),
75            });
76        }
77
78        #[cfg(target_os = "linux")]
79        {
80            // `cudarc 0.19`'s entry points lazily initialize the CUDA driver
81            // through generated `culib()` helpers. On CPU-only Linux hosts the
82            // first such call emits `panic_no_lib_found` before unwinding, which
83            // polluted large-scale logs even when the panic was later caught and the
84            // fit fell back to CPU. Keep the preflight completely outside
85            // cudarc: use gam's own `libloading` probe first, and only touch
86            // cudarc after the platform loader can open `libcuda`.
87            //
88            // The preflight does not always agree with cudarc's own loader
89            // candidate list (e.g. large-scale workbench images expose CUDA *runtime*
90            // stub libraries under `/usr/local/cuda-*/targets/.../lib` but no
91            // driver `libcuda.so` in any loader path), so we additionally
92            // install a panic-hook filter that suppresses cudarc's
93            // `panic_no_lib_found` message and wrap every cudarc entry point
94            // below in `catch_unwind` to convert the panic into a typed
95            // `GpuError::DriverCallFailed` instead.
96            install_cudarc_panic_filter();
97            // #1017 probe-first fix: establish cudarc's primary context P and
98            // initialize the CUDA runtime ON IT as the VERY FIRST CUDA action -- before
99            // gam's libloading libcuda preload, the compute-lib dlopens, and device_count.
100            // The clean cuda_context_for-first path works; the probe-first path failed
101            // because a pre-context CUDA touch left the runtime bound to a non-P context,
102            // so later cuBLAS/cuSOLVER handle creation on the P-stream returned
103            // NOT_INITIALIZED. Making cuda_context_for the first action replicates the
104            // working clean path (CudaContext::new loads libcuda + retains the primary +
105            // ensure runs the runtime init); on a CPU-only host it returns None cleanly
106            // via the panic filter + catch_unwind, and the preload check below still runs.
107            let primary_ready = cuda_context_for(0).is_some();
108            log::trace!("[GPU] probe pre-init primary context + runtime: {primary_ready}");
109            if crate::driver::preload_cuda_driver().is_err() {
110                let reason = "libcuda unavailable";
111                Self::record_cpu_reason(reason);
112                log::info!("[GPU] CUDA acceleration disabled: {reason}");
113                diagnostics::log_cuda_disabled(reason);
114                return Err(GpuError::DriverLibraryUnavailable {
115                    reason: reason.to_string(),
116                });
117            }
118
119            // Driver-only environments (e.g. large-scale workbench images that expose
120            // `libcuda.so.1` but ship no cuBLAS/cuSOLVER/cuSPARSE) used to slip
121            // past the libcuda preflight, enable the runtime, and then panic
122            // out of cudarc's `panic_no_lib_found` on the first `CudaBlas::new`
123            // — the panic crossed the PyO3 FFI boundary as a
124            // `ValueError: fit_table panicked inside Rust boundary: Unable to
125            // dynamically load the "cublas" shared library`. The compute
126            // libraries are dispatch-critical (every cuBLAS / cuSOLVER /
127            // cuSPARSE site under `src/gpu/` calls `CudaBlas::new` /
128            // `DnHandle::new` / cusparse handle creation eagerly during
129            // workspace allocation), so we refuse to advertise GPU unless all
130            // three load cleanly here.
131            for stem in ["cublas", "cusolver", "cusparse"] {
132                if !crate::driver::cuda_compute_library_present(stem) {
133                    let reason = format!("lib{stem} unavailable");
134                    Self::record_cpu_reason(reason.clone());
135                    log::info!("[GPU] CUDA acceleration disabled: {reason}");
136                    diagnostics::log_cuda_disabled(&reason);
137                    return Err(GpuError::DriverLibraryUnavailable { reason });
138                }
139            }
140
141            // cudarc 0.19's `culib()` panics via `panic_no_lib_found` when its
142            // own (separate from gam's) dynamic-loader candidate list cannot
143            // find libcuda — this can happen even after our `preload_cuda_driver`
144            // succeeds, for example if our probe loaded a CUDA stub library but
145            // cudarc's loader searches a disjoint set of names. Convert any such
146            // panic into a typed probe failure so the runtime cleanly disables
147            // CUDA and the CPU fallback proceeds without alarming stderr noise.
148            let device_count = catch_unwind(AssertUnwindSafe(CudaContext::device_count))
149                .map_err(|_| GpuError::DriverLibraryUnavailable {
150                    reason: "libcuda unavailable".to_string(),
151                })?
152                .map_err(|err| GpuError::DriverCallFailed {
153                    reason: err.to_string(),
154                })?;
155            if device_count <= 0 {
156                let reason = "CUDA driver reported no devices";
157                Self::record_cpu_reason(reason);
158                diagnostics::log_cuda_disabled(reason);
159                // Surface the no-device state as a structured `DriverCallFailed`
160                // so callers wanting a CPU-reason marker can distinguish
161                // "policy off" (Ok(None)) from "driver present but no usable
162                // hardware" (Err). This keeps `GpuRuntime::probe()` honest: a
163                // successful `Ok` always carries at least one device.
164                return Err(GpuError::DriverCallFailed {
165                    reason: reason.to_string(),
166                });
167            }
168
169            let mut devices = Vec::new();
170            for ordinal in
171                0..usize::try_from(device_count).map_err(|_| GpuError::DriverCallFailed {
172                    reason: "negative CUDA device count".into(),
173                })?
174            {
175                let ctx = cuda_context_for(ordinal).ok_or_else(|| {
176                    gpu_err!("failed to create CUDA context for device {ordinal}")
177                })?;
178                catch_unwind(AssertUnwindSafe(|| ctx.bind_to_thread()))
179                    .map_err(|_| GpuError::DriverLibraryUnavailable {
180                        reason: "libcuda unavailable".to_string(),
181                    })?
182                    .map_err(|err| GpuError::DriverCallFailed {
183                        reason: err.to_string(),
184                    })?;
185                devices.push(
186                    catch_unwind(AssertUnwindSafe(|| cuda_device_info(ordinal, &ctx))).map_err(
187                        |_| GpuError::DriverLibraryUnavailable {
188                            reason: "libcuda unavailable".to_string(),
189                        },
190                    )??,
191                );
192            }
193
194            devices.sort_by(|a, b| b.score().total_cmp(&a.score()));
195            let Some(device) = devices.first().cloned() else {
196                Self::record_cpu_reason("CUDA driver reported no usable devices");
197                diagnostics::log_cuda_disabled("CUDA driver reported no usable devices");
198                return Ok(None);
199            };
200
201            let policy = crate::calibration::calibrated_policy_for_device(&device);
202            let memory_budget_bytes = device.memory_budget_bytes();
203            diagnostics::log_cuda_enabled(&device, &policy);
204            diagnostics::log_cuda_pool(&devices);
205
206            Ok(Some(Self {
207                device,
208                devices,
209                policy,
210                memory_budget_bytes,
211            }))
212        }
213    }
214
215    #[must_use]
216    pub fn global() -> Option<&'static Self> {
217        static RUNTIME: OnceLock<Option<GpuRuntime>> = OnceLock::new();
218        RUNTIME
219            .get_or_init(|| match Self::probe() {
220                Ok(runtime) => runtime,
221                Err(err) => {
222                    let reason = err.to_string();
223                    Self::record_cpu_reason(reason.clone());
224                    diagnostics::log_cuda_disabled(&reason);
225                    None
226                }
227            })
228            .as_ref()
229    }
230
231    #[must_use]
232    pub fn is_available() -> bool {
233        Self::global().is_some()
234    }
235
236    /// Fail-closed accessor for the process-wide runtime under a [`GpuMode`]
237    /// contract (issue #1017).
238    ///
239    /// * [`GpuMode::Required`] — the device MUST be present: when the probe
240    ///   found no usable runtime this returns `Err(GpuError::DriverLibraryUnavailable)`
241    ///   carrying the recorded CPU reason, so the resident path surfaces a
242    ///   structured error instead of silently falling back to the CPU.
243    /// * [`GpuMode::Auto`] / [`GpuMode::Off`] — preserve the existing
244    ///   probe-first behavior bit-for-bit: this is a thin wrapper over
245    ///   [`Self::global`] that maps the `None` case to the same typed error
246    ///   without ever forcing the runtime on or changing any numerics. `Auto`
247    ///   callers treat the `Err` exactly as they treated `global().is_none()`
248    ///   today (fall back to CPU); only the `Required` caller propagates it.
249    ///
250    /// This does NOT alter `global()`/`cuda_context_for`/`ensure_cuda_runtime_device`;
251    /// it only adds the residency gate on top of the working Auto path.
252    pub fn global_or_fail(mode: super::GpuMode) -> Result<&'static Self, GpuError> {
253        match mode {
254            super::GpuMode::Off => Err(GpuError::DriverLibraryUnavailable {
255                reason: "GPU residency mode is off".to_string(),
256            }),
257            super::GpuMode::Auto | super::GpuMode::Required => {
258                Self::global().ok_or_else(|| GpuError::DriverLibraryUnavailable {
259                    reason: Self::cpu_reason()
260                        .unwrap_or("CUDA runtime unavailable")
261                        .to_string(),
262                })
263            }
264        }
265    }
266
267    #[must_use]
268    pub fn policy(&self) -> &GpuDispatchPolicy {
269        &self.policy
270    }
271
272    #[must_use]
273    pub fn selected_device(&self) -> &GpuDeviceInfo {
274        &self.device
275    }
276
277    #[must_use]
278    pub(crate) fn cpu_reason() -> Option<&'static str> {
279        CPU_REASON.get().map(String::as_str)
280    }
281
282    fn record_cpu_reason(reason: impl Into<String>) {
283        CPU_REASON.set(reason.into()).ok();
284    }
285}
286
287/// Make the CUDA **runtime** API usable on `ordinal`.
288///
289/// gam drives the GPU through the CUDA *driver* API (cudarc [`CudaContext`]),
290/// which materialises the driver primary context but never selects a device for
291/// the CUDA *runtime* API. cuBLAS / cuSOLVER are runtime-based, so `cublasCreate`
292/// / `cusolverDnCreate` return `CUBLAS_STATUS_NOT_INITIALIZED` /
293/// `CUSOLVER_STATUS_NOT_INITIALIZED` until the runtime has a current device —
294/// which silently disables *every* GPU linear-algebra path (the dispatch sites
295/// map the handle error to `Unavailable` and fall back to CPU). We select the
296/// device on the calling host thread (cheap, idempotent) and force one-time
297/// runtime primary-context materialisation per device via the canonical
298/// `cudaMalloc`/`cudaFree` idiom, so every downstream handle creation succeeds.
299#[cfg(target_os = "linux")]
300fn ensure_cuda_runtime_device(ordinal: usize) {
301    let Ok(o) = i32::try_from(ordinal) else {
302        return;
303    };
304    // SAFETY: the `runtime` cudarc feature is enabled; cudaSetDevice on a valid
305    // ordinal is idempotent and per-host-thread.
306    let set_rc = unsafe { cudarc::runtime::sys::cudaSetDevice(o) };
307    log::trace!("[GPU] runtime cudaSetDevice({o}) -> {set_rc:?}");
308    // Materialise the runtime primary context on EVERY call (not once): the driver
309    // probe binds cudarc's own context, and cuBLAS/cuSOLVER `*Create` use whatever
310    // context is current at creation time, so the runtime device must be reselected
311    // and its primary context re-materialised immediately before each handle is made.
312    // A 256-byte allocate-then-free is the canonical, ~microsecond way to force it.
313    let mut p: *mut core::ffi::c_void = core::ptr::null_mut();
314    // SAFETY: forces runtime primary-context creation on the current device.
315    let malloc_rc = unsafe { cudarc::runtime::sys::cudaMalloc(&mut p as *mut _ as *mut _, 256) };
316    log::trace!("[GPU] runtime cudaMalloc -> {malloc_rc:?}");
317    if !p.is_null() {
318        // SAFETY: `p` is the live device allocation returned just above.
319        let free_rc = unsafe { cudarc::runtime::sys::cudaFree(p) };
320        log::trace!("[GPU] runtime cudaFree -> {free_rc:?}");
321    }
322}
323
324#[cfg(target_os = "linux")]
325pub fn cuda_context_for(ordinal: usize) -> Option<Arc<CudaContext>> {
326    static CONTEXTS: OnceLock<Mutex<HashMap<usize, Arc<CudaContext>>>> = OnceLock::new();
327    let contexts = CONTEXTS.get_or_init(|| Mutex::new(HashMap::new()));
328    if let Some(ctx) = contexts.lock().ok()?.get(&ordinal).cloned() {
329        // Bind cudarc's PRIMARY context current on THIS thread BEFORE the runtime
330        // materialisation below, so the runtime initialises the same context that
331        // new_stream()/CudaBlas::new run cublasCreate against. Without this, on a
332        // fresh solve thread the cached path lets the runtime init its own device
333        // context, and the later cublasCreate on the primary-context stream fails
334        // CUBLAS/CUSOLVER_STATUS_NOT_INITIALIZED (the probe-first GPU-dead bug).
335        let bound = catch_unwind(AssertUnwindSafe(|| ctx.bind_to_thread()));
336        log::trace!(
337            "[GPU] cuda_context_for cached bind ok={}",
338            matches!(bound, Ok(Ok(())))
339        );
340        ensure_cuda_runtime_device(ordinal);
341        return Some(ctx);
342    }
343    // cudarc 0.19 panics from `panic_no_lib_found` if its loader fails to
344    // locate libcuda. Demote that to `None` so the runtime probe surfaces a
345    // typed `DriverUnavailable` rather than tearing down the worker thread.
346    let ctx = catch_unwind(AssertUnwindSafe(|| CudaContext::new(ordinal)))
347        .ok()?
348        .ok()?;
349    let out = {
350        let mut guard = contexts.lock().ok()?;
351        guard.entry(ordinal).or_insert_with(|| ctx.clone()).clone()
352    };
353    // CudaContext::new already bound the primary context, but the HashMap may return
354    // an entry created on another thread; rebind so the primary context is current on
355    // THIS thread before the runtime touch (same probe-first NOT_INITIALIZED guard).
356    let bound = catch_unwind(AssertUnwindSafe(|| out.bind_to_thread()));
357    log::trace!(
358        "[GPU] cuda_context_for fresh bind ok={}",
359        matches!(bound, Ok(Ok(())))
360    );
361    ensure_cuda_runtime_device(ordinal);
362    Some(out)
363}
364
365#[cfg(target_os = "linux")]
366fn cuda_device_info(ordinal: usize, ctx: &CudaContext) -> Result<GpuDeviceInfo, GpuError> {
367    result::init().map_err(|err| GpuError::DriverCallFailed {
368        reason: err.to_string(),
369    })?;
370    let device =
371        result::device::get(
372            i32::try_from(ordinal).map_err(|_| GpuError::DriverCallFailed {
373                reason: "device ordinal overflow".into(),
374            })?,
375        )
376        .map_err(|err| GpuError::DriverCallFailed {
377            reason: err.to_string(),
378        })?;
379    let attr = |attribute| -> Result<i32, GpuError> {
380        // SAFETY: device comes from cudarc's validated device::get.
381        unsafe { result::device::get_attribute(device, attribute) }.map_err(|err| {
382            GpuError::DriverCallFailed {
383                reason: err.to_string(),
384            }
385        })
386    };
387    let (free_mem_bytes, total_mem_bytes) =
388        ctx.mem_get_info()
389            .map_err(|err| GpuError::DriverCallFailed {
390                reason: err.to_string(),
391            })?;
392    let major = attr(sys::CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR)?;
393    let minor = attr(sys::CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR)?;
394    Ok(GpuDeviceInfo {
395        ordinal,
396        name: result::device::get_name(device).unwrap_or_else(|_| format!("CUDA device {ordinal}")),
397        capability: super::device::GpuCapability::from_compute_capability(major, minor),
398        sm_count: attr(sys::CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT)?,
399        max_threads_per_sm: attr(
400            sys::CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
401        )?,
402        max_shared_mem_per_block: attr(
403            sys::CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
404        )
405        .unwrap_or(0) as usize,
406        l2_cache_bytes: attr(sys::CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE)
407            .unwrap_or(0) as usize,
408        total_mem_bytes,
409        free_mem_bytes,
410        ecc_enabled: attr(sys::CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_ECC_ENABLED)
411            .unwrap_or(0)
412            != 0,
413        integrated: attr(sys::CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_INTEGRATED).unwrap_or(0)
414            != 0,
415        mig_mode: false,
416    })
417}
418
419#[cfg(test)]
420mod module_path_lock_tests {
421    //! Locks the canonical module path for the GPU device runtime so a future
422    //! rename is a deliberate, reviewed change (precedent: issue #1157's
423    //! "lock module path" tests). This file was renamed from the generic,
424    //! colliding `gpu/runtime.rs` to `gpu/device_runtime.rs` under issue #1137.
425
426    #[test]
427    fn gpu_device_runtime_module_path_is_canonical() {
428        // Resolving `GpuRuntime` through the `device_runtime` module path
429        // pins the honest name; if the module is renamed this stops compiling.
430        _ = crate::device_runtime::GpuRuntime::is_available();
431        let type_name = std::any::type_name::<crate::device_runtime::GpuRuntime>();
432        assert!(
433            type_name.contains("device_runtime"),
434            "GpuRuntime must live in the `device_runtime` module (got {type_name})"
435        );
436    }
437}
438
439#[cfg(all(test, target_os = "linux"))]
440mod tests {
441    use super::*;
442
443    /// On a CPU-only host (no `libcuda.dylib` / `libcuda.so` reachable via the
444    /// platform loader), exercising every cudarc-touching entry point in this
445    /// crate must produce a clean `None`/`Err` and never trigger
446    /// `cudarc::panic_no_lib_found`. This is the regression guard for issues
447    /// #168 and #176, which observed a `PanicException` escaping the PyO3
448    /// boundary on macOS when `sae_manifold_fit(..., atom_basis="duchon")` or
449    /// `d_atom=1` ran on a host with no CUDA driver.
450    ///
451    /// On a host where libcuda *is* present the test still passes — it asserts
452    /// only that calls don't panic and that `is_culib_present()` agrees with
453    /// `GpuRuntime::is_available()` about the absence of a driver.
454    #[test]
455    fn cpu_only_host_never_panics_on_gpu_entry_points() {
456        // Without libcuda the runtime must report unavailable rather than
457        // panicking from inside `culib()`; with libcuda the runtime may or
458        // may not have a usable device, but the panic-free contract still
459        // holds and the dispatch smoke test below exercises it.
460        let culib_present = crate::driver::cuda_driver_library_present();
461        if !culib_present {
462            assert!(
463                !GpuRuntime::is_available(),
464                "is_culib_present()=false but GpuRuntime::is_available() returned true; \
465                 the probe guard from c10e6636 has regressed and downstream cudarc \
466                 calls will panic"
467            );
468        }
469
470        // Every public GPU dispatch must return a value (no panic) when the
471        // runtime is unavailable. We use minimum-size inputs so a host that
472        // *does* have a GPU still passes (workload below dispatch threshold
473        // → returns None / Err / CPU fallback the same way).
474        use ndarray::{Array1, Array2};
475        let a = Array2::<f64>::zeros((4, 3));
476        let b = Array2::<f64>::zeros((3, 2));
477        let v = Array1::<f64>::zeros(3);
478        let w = Array1::<f64>::ones(4);
479
480        // gpu::linalg_dispatch dispatchers
481        crate::try_fast_ab(a.view(), b.view());
482        crate::try_fast_av(a.view(), v.view());
483        crate::try_fast_atv(a.view(), w.view());
484        let mut chol_in = Array2::<f64>::eye(3);
485        crate::try_cholesky_lower_inplace(&mut chol_in);
486
487        // gpu::solver Cholesky entry points
488        let h = Array2::<f64>::eye(3);
489        let rhs = Array2::<f64>::zeros((3, 1));
490        let solve_outcome = crate::solver::cholesky_solve_gpu(h.view(), rhs.view());
491        let factor_outcome = crate::solver::cholesky_lower_gpu(h.view());
492        if !GpuRuntime::is_available() {
493            assert!(
494                solve_outcome.is_err(),
495                "cholesky_solve_gpu must Err when runtime is unavailable"
496            );
497            assert!(
498                factor_outcome.is_err(),
499                "cholesky_lower_gpu must Err when runtime is unavailable"
500            );
501        }
502
503        // NOTE: the weighted-crossprod GPU dispatcher with CPU fallback
504        // (`weighted_crossprod_gpu`) moved out of this crate to `gam-solve`
505        // (`gpu::pirls_gpu`) during the #1521 crate carve, since it depends on
506        // the higher-level PIRLS assembly. Its panic-free / Ok-via-CPU-fallback
507        // contract is now exercised by a regression test there
508        // (`weighted_crossprod_gpu_cpu_fallback_*`), not from gam-gpu, which
509        // cannot reach gam-solve without a dependency cycle.
510    }
511}
gam_gpu/device_runtime.rs

gam_gpu/
device_runtime.rs