Skip to main content

gam_gpu/
device_runtime.rs

1#[cfg(target_os = "linux")]
2use std::collections::HashMap;
3#[cfg(target_os = "linux")]
4use std::panic::{self, AssertUnwindSafe, catch_unwind};
5use std::sync::OnceLock;
6#[cfg(target_os = "linux")]
7use std::sync::{Arc, Mutex};
8
9use super::device::GpuDeviceInfo;
10use super::gpu_error::GpuError;
11use super::policy::GpuDispatchPolicy;
12#[cfg(target_os = "linux")]
13use cudarc::driver::{CudaContext, result, sys};
14
15#[path = "runtime_diagnostics.rs"]
16pub(crate) mod diagnostics;
17
18#[derive(Clone, Debug)]
19#[must_use]
20pub struct GpuRuntime {
21    /// Highest-scoring probed CUDA device. Existing dispatch code routes
22    /// one-shot kernels through this device.
23    pub device: GpuDeviceInfo,
24    /// All usable CUDA devices discovered at probe time, ordered by score.
25    pub devices: Vec<GpuDeviceInfo>,
26    pub policy: GpuDispatchPolicy,
27    pub memory_budget_bytes: usize,
28}
29
30static CPU_REASON: OnceLock<String> = OnceLock::new();
31
32/// Install a process-wide panic hook (idempotent) that drops cudarc's
33/// `panic_no_lib_found` message instead of writing it to stderr. All other
34/// panics flow to the previously installed hook unchanged. The site cudarc
35/// 0.19 panics from is `cudarc-0.19.7/src/lib.rs:200` inside its dynamic
36/// loader; messages from that path start with `Unable to dynamically load`.
37/// Caller code wraps the same cudarc entry points in `catch_unwind`, so the
38/// panic is recovered — this hook just prevents the stderr noise that made
39/// operators think the fit had crashed.
40#[cfg(target_os = "linux")]
41fn install_cudarc_panic_filter() {
42    static HOOK_INSTALLED: OnceLock<()> = OnceLock::new();
43    HOOK_INSTALLED.get_or_init(|| {
44        let prior = panic::take_hook();
45        panic::set_hook(Box::new(move |info| {
46            let payload = info.payload();
47            let message = payload
48                .downcast_ref::<&'static str>()
49                .copied()
50                .or_else(|| payload.downcast_ref::<String>().map(String::as_str))
51                .unwrap_or("");
52            if message.starts_with("Unable to dynamically load") {
53                return;
54            }
55            prior(info);
56        }));
57    });
58}
59
60impl GpuRuntime {
61    pub fn probe() -> Result<Option<Self>, GpuError> {
62        if super::global_policy() == super::GpuPolicy::Off {
63            Self::record_cpu_reason("GPU policy is off");
64            diagnostics::log_cuda_disabled("GPU policy is off");
65            return Ok(None);
66        }
67
68        #[cfg(not(target_os = "linux"))]
69        {
70            let reason = "CUDA support not compiled into this build";
71            Self::record_cpu_reason(reason);
72            diagnostics::log_cuda_disabled(reason);
73            return Err(GpuError::DriverLibraryUnavailable {
74                reason: reason.to_string(),
75            });
76        }
77
78        #[cfg(target_os = "linux")]
79        {
80            // `cudarc 0.19`'s entry points lazily initialize the CUDA driver
81            // through generated `culib()` helpers. On CPU-only Linux hosts the
82            // first such call emits `panic_no_lib_found` before unwinding, which
83            // polluted large-scale logs even when the panic was later caught and the
84            // fit fell back to CPU. Keep the preflight completely outside
85            // cudarc: use gam's own `libloading` probe first, and only touch
86            // cudarc after the platform loader can open `libcuda`.
87            //
88            // The preflight does not always agree with cudarc's own loader
89            // candidate list (e.g. large-scale workbench images expose CUDA *runtime*
90            // stub libraries under `/usr/local/cuda-*/targets/.../lib` but no
91            // driver `libcuda.so` in any loader path), so we additionally
92            // install a panic-hook filter that suppresses cudarc's
93            // `panic_no_lib_found` message and wrap every cudarc entry point
94            // below in `catch_unwind` to convert the panic into a typed
95            // `GpuError::DriverCallFailed` instead.
96            install_cudarc_panic_filter();
97            // #1017 probe-first fix: establish cudarc's primary context P and
98            // initialize the CUDA runtime ON IT as the VERY FIRST CUDA action -- before
99            // gam's libloading libcuda preload, the compute-lib dlopens, and device_count.
100            // The clean cuda_context_for-first path works; the probe-first path failed
101            // because a pre-context CUDA touch left the runtime bound to a non-P context,
102            // so later cuBLAS/cuSOLVER handle creation on the P-stream returned
103            // NOT_INITIALIZED. Making cuda_context_for the first action replicates the
104            // working clean path (CudaContext::new loads libcuda + retains the primary +
105            // ensure runs the runtime init); on a CPU-only host it returns None cleanly
106            // via the panic filter + catch_unwind, and the preload check below still runs.
107            let primary_ready = cuda_context_for(0).is_some();
108            log::trace!("[GPU] probe pre-init primary context + runtime: {primary_ready}");
109            if crate::driver::preload_cuda_driver().is_err() {
110                let reason = "libcuda unavailable";
111                Self::record_cpu_reason(reason);
112                log::info!("[GPU] CUDA acceleration disabled: {reason}");
113                diagnostics::log_cuda_disabled(reason);
114                return Err(GpuError::DriverLibraryUnavailable {
115                    reason: reason.to_string(),
116                });
117            }
118
119            // Driver-only environments (e.g. large-scale workbench images that expose
120            // `libcuda.so.1` but ship no cuBLAS/cuSOLVER/cuSPARSE) used to slip
121            // past the libcuda preflight, enable the runtime, and then panic
122            // out of cudarc's `panic_no_lib_found` on the first `CudaBlas::new`
123            // — the panic crossed the PyO3 FFI boundary as a
124            // `ValueError: fit_table panicked inside Rust boundary: Unable to
125            // dynamically load the "cublas" shared library`. The compute
126            // libraries are dispatch-critical (every cuBLAS / cuSOLVER /
127            // cuSPARSE site under `src/gpu/` calls `CudaBlas::new` /
128            // `DnHandle::new` / cusparse handle creation eagerly during
129            // workspace allocation), so we refuse to advertise GPU unless all
130            // three load cleanly here.
131            for stem in ["cublas", "cusolver", "cusparse"] {
132                if !crate::driver::cuda_compute_library_present(stem) {
133                    let reason = format!("lib{stem} unavailable");
134                    Self::record_cpu_reason(reason.clone());
135                    log::info!("[GPU] CUDA acceleration disabled: {reason}");
136                    diagnostics::log_cuda_disabled(&reason);
137                    return Err(GpuError::DriverLibraryUnavailable { reason });
138                }
139            }
140
141            // cudarc 0.19's `culib()` panics via `panic_no_lib_found` when its
142            // own (separate from gam's) dynamic-loader candidate list cannot
143            // find libcuda — this can happen even after our `preload_cuda_driver`
144            // succeeds, for example if our probe loaded a CUDA stub library but
145            // cudarc's loader searches a disjoint set of names. Convert any such
146            // panic into a typed probe failure so the runtime cleanly disables
147            // CUDA and the CPU fallback proceeds without alarming stderr noise.
148            let device_count = catch_unwind(AssertUnwindSafe(CudaContext::device_count))
149                .map_err(|_| GpuError::DriverLibraryUnavailable {
150                    reason: "libcuda unavailable".to_string(),
151                })?
152                .map_err(|err| GpuError::DriverCallFailed {
153                    reason: err.to_string(),
154                })?;
155            if device_count <= 0 {
156                let reason = "CUDA driver reported no devices";
157                Self::record_cpu_reason(reason);
158                diagnostics::log_cuda_disabled(reason);
159                // Surface the no-device state as a structured `DriverCallFailed`
160                // so callers wanting a CPU-reason marker can distinguish
161                // "policy off" (Ok(None)) from "driver present but no usable
162                // hardware" (Err). This keeps `GpuRuntime::probe()` honest: a
163                // successful `Ok` always carries at least one device.
164                return Err(GpuError::DriverCallFailed {
165                    reason: reason.to_string(),
166                });
167            }
168
169            let mut devices = Vec::new();
170            for ordinal in
171                0..usize::try_from(device_count).map_err(|_| GpuError::DriverCallFailed {
172                    reason: "negative CUDA device count".into(),
173                })?
174            {
175                let ctx = cuda_context_for(ordinal).ok_or_else(|| {
176                    gpu_err!("failed to create CUDA context for device {ordinal}")
177                })?;
178                catch_unwind(AssertUnwindSafe(|| ctx.bind_to_thread()))
179                    .map_err(|_| GpuError::DriverLibraryUnavailable {
180                        reason: "libcuda unavailable".to_string(),
181                    })?
182                    .map_err(|err| GpuError::DriverCallFailed {
183                        reason: err.to_string(),
184                    })?;
185                devices.push(
186                    catch_unwind(AssertUnwindSafe(|| cuda_device_info(ordinal, &ctx))).map_err(
187                        |_| GpuError::DriverLibraryUnavailable {
188                            reason: "libcuda unavailable".to_string(),
189                        },
190                    )??,
191                );
192            }
193
194            devices.sort_by(|a, b| b.score().total_cmp(&a.score()));
195            let Some(device) = devices.first().cloned() else {
196                Self::record_cpu_reason("CUDA driver reported no usable devices");
197                diagnostics::log_cuda_disabled("CUDA driver reported no usable devices");
198                return Ok(None);
199            };
200
201            let policy = crate::calibration::calibrated_policy_for_device(&device);
202            let memory_budget_bytes = device.memory_budget_bytes();
203            diagnostics::log_cuda_enabled(&device, &policy);
204            diagnostics::log_cuda_pool(&devices);
205
206            Ok(Some(Self {
207                device,
208                devices,
209                policy,
210                memory_budget_bytes,
211            }))
212        }
213    }
214
215    #[must_use]
216    pub fn global() -> Option<&'static Self> {
217        static RUNTIME: OnceLock<Option<GpuRuntime>> = OnceLock::new();
218        RUNTIME
219            .get_or_init(|| {
220                let runtime = match Self::probe() {
221                    Ok(runtime) => runtime,
222                    Err(err) => {
223                        let reason = err.to_string();
224                        Self::record_cpu_reason(reason.clone());
225                        diagnostics::log_cuda_disabled(&reason);
226                        None
227                    }
228                };
229                // Install the dense-GEMM dispatch hook exactly when a usable
230                // device was probed. Without this, `gam_linalg::faer_ndarray::fast_ab`
231                // (and the `fast_atb`/`fast_av`/`xt_diag_x` family) never sees a
232                // dispatcher — `gpu_dispatch()` stays `None` — so every dense
233                // product in the engine silently runs on the CPU even when the
234                // V100 is present and the workload clears the policy flop floor.
235                // The hook is a first-write-wins `OnceLock` keyed only on the
236                // presence of a runtime; registering it here, inside the same
237                // `get_or_init` that decides the runtime, guarantees it is
238                // installed before any `fast_ab` caller can observe a `Some`
239                // runtime. The policy gate inside each `try_*` still decides
240                // CPU-vs-GPU per call, so small products are unaffected.
241                if runtime.is_some() {
242                    gam_linalg::gpu_hook::register_gpu_dispatch(Box::new(
243                        super::linalg_dispatch::CudaGemmDispatch,
244                    ));
245                }
246                runtime
247            })
248            .as_ref()
249    }
250
251    #[must_use]
252    pub fn is_available() -> bool {
253        Self::global().is_some()
254    }
255
256    /// Fail-closed accessor for the process-wide runtime under a [`GpuMode`]
257    /// contract (issue #1017).
258    ///
259    /// * [`GpuMode::Required`] — the device MUST be present: when the probe
260    ///   found no usable runtime this returns `Err(GpuError::DriverLibraryUnavailable)`
261    ///   carrying the recorded CPU reason, so the resident path surfaces a
262    ///   structured error instead of silently falling back to the CPU.
263    /// * [`GpuMode::Auto`] / [`GpuMode::Off`] — preserve the existing
264    ///   probe-first behavior bit-for-bit: this is a thin wrapper over
265    ///   [`Self::global`] that maps the `None` case to the same typed error
266    ///   without ever forcing the runtime on or changing any numerics. `Auto`
267    ///   callers treat the `Err` exactly as they treated `global().is_none()`
268    ///   today (fall back to CPU); only the `Required` caller propagates it.
269    ///
270    /// This does NOT alter `global()`/`cuda_context_for`/`ensure_cuda_runtime_device`;
271    /// it only adds the residency gate on top of the working Auto path.
272    pub fn global_or_fail(mode: super::GpuMode) -> Result<&'static Self, GpuError> {
273        match mode {
274            super::GpuMode::Off => Err(GpuError::DriverLibraryUnavailable {
275                reason: "GPU residency mode is off".to_string(),
276            }),
277            super::GpuMode::Auto | super::GpuMode::Required => {
278                Self::global().ok_or_else(|| GpuError::DriverLibraryUnavailable {
279                    reason: Self::cpu_reason()
280                        .unwrap_or("CUDA runtime unavailable")
281                        .to_string(),
282                })
283            }
284        }
285    }
286
287    #[must_use]
288    pub fn policy(&self) -> &GpuDispatchPolicy {
289        &self.policy
290    }
291
292    #[must_use]
293    pub fn selected_device(&self) -> &GpuDeviceInfo {
294        &self.device
295    }
296
297    #[must_use]
298    pub(crate) fn cpu_reason() -> Option<&'static str> {
299        CPU_REASON.get().map(String::as_str)
300    }
301
302    fn record_cpu_reason(reason: impl Into<String>) {
303        CPU_REASON.set(reason.into()).ok();
304    }
305}
306
307/// Make the CUDA **runtime** API usable on `ordinal`.
308///
309/// gam drives the GPU through the CUDA *driver* API (cudarc [`CudaContext`]),
310/// which materialises the driver primary context but never selects a device for
311/// the CUDA *runtime* API. cuBLAS / cuSOLVER are runtime-based, so `cublasCreate`
312/// / `cusolverDnCreate` return `CUBLAS_STATUS_NOT_INITIALIZED` /
313/// `CUSOLVER_STATUS_NOT_INITIALIZED` until the runtime has a current device —
314/// which silently disables *every* GPU linear-algebra path (the dispatch sites
315/// map the handle error to `Unavailable` and fall back to CPU). We select the
316/// device on the calling host thread (cheap, idempotent) and force one-time
317/// runtime primary-context materialisation per device via the canonical
318/// `cudaMalloc`/`cudaFree` idiom, so every downstream handle creation succeeds.
319#[cfg(target_os = "linux")]
320fn ensure_cuda_runtime_device(ordinal: usize) {
321    let Ok(o) = i32::try_from(ordinal) else {
322        return;
323    };
324    // SAFETY: the `runtime` cudarc feature is enabled; cudaSetDevice on a valid
325    // ordinal is idempotent and per-host-thread.
326    let set_rc = unsafe { cudarc::runtime::sys::cudaSetDevice(o) };
327    log::trace!("[GPU] runtime cudaSetDevice({o}) -> {set_rc:?}");
328    // Materialise the runtime primary context on EVERY call (not once): the driver
329    // probe binds cudarc's own context, and cuBLAS/cuSOLVER `*Create` use whatever
330    // context is current at creation time, so the runtime device must be reselected
331    // and its primary context re-materialised immediately before each handle is made.
332    // A 256-byte allocate-then-free is the canonical, ~microsecond way to force it.
333    let mut p: *mut core::ffi::c_void = core::ptr::null_mut();
334    // SAFETY: forces runtime primary-context creation on the current device.
335    let malloc_rc = unsafe { cudarc::runtime::sys::cudaMalloc(&mut p as *mut _ as *mut _, 256) };
336    log::trace!("[GPU] runtime cudaMalloc -> {malloc_rc:?}");
337    if !p.is_null() {
338        // SAFETY: `p` is the live device allocation returned just above.
339        let free_rc = unsafe { cudarc::runtime::sys::cudaFree(p) };
340        log::trace!("[GPU] runtime cudaFree -> {free_rc:?}");
341    }
342}
343
344#[cfg(target_os = "linux")]
345pub fn cuda_context_for(ordinal: usize) -> Option<Arc<CudaContext>> {
346    static CONTEXTS: OnceLock<Mutex<HashMap<usize, Arc<CudaContext>>>> = OnceLock::new();
347    let contexts = CONTEXTS.get_or_init(|| Mutex::new(HashMap::new()));
348    if let Some(ctx) = contexts.lock().ok()?.get(&ordinal).cloned() {
349        // Bind cudarc's PRIMARY context current on THIS thread BEFORE the runtime
350        // materialisation below, so the runtime initialises the same context that
351        // new_stream()/CudaBlas::new run cublasCreate against. Without this, on a
352        // fresh solve thread the cached path lets the runtime init its own device
353        // context, and the later cublasCreate on the primary-context stream fails
354        // CUBLAS/CUSOLVER_STATUS_NOT_INITIALIZED (the probe-first GPU-dead bug).
355        let bound = catch_unwind(AssertUnwindSafe(|| ctx.bind_to_thread()));
356        log::trace!(
357            "[GPU] cuda_context_for cached bind ok={}",
358            matches!(bound, Ok(Ok(())))
359        );
360        ensure_cuda_runtime_device(ordinal);
361        return Some(ctx);
362    }
363    // cudarc 0.19 panics from `panic_no_lib_found` if its loader fails to
364    // locate libcuda. Demote that to `None` so the runtime probe surfaces a
365    // typed `DriverUnavailable` rather than tearing down the worker thread.
366    let ctx = catch_unwind(AssertUnwindSafe(|| CudaContext::new(ordinal)))
367        .ok()?
368        .ok()?;
369    let out = {
370        let mut guard = contexts.lock().ok()?;
371        guard.entry(ordinal).or_insert_with(|| ctx.clone()).clone()
372    };
373    // CudaContext::new already bound the primary context, but the HashMap may return
374    // an entry created on another thread; rebind so the primary context is current on
375    // THIS thread before the runtime touch (same probe-first NOT_INITIALIZED guard).
376    let bound = catch_unwind(AssertUnwindSafe(|| out.bind_to_thread()));
377    log::trace!(
378        "[GPU] cuda_context_for fresh bind ok={}",
379        matches!(bound, Ok(Ok(())))
380    );
381    ensure_cuda_runtime_device(ordinal);
382    Some(out)
383}
384
385#[cfg(target_os = "linux")]
386fn cuda_device_info(ordinal: usize, ctx: &CudaContext) -> Result<GpuDeviceInfo, GpuError> {
387    result::init().map_err(|err| GpuError::DriverCallFailed {
388        reason: err.to_string(),
389    })?;
390    let device =
391        result::device::get(
392            i32::try_from(ordinal).map_err(|_| GpuError::DriverCallFailed {
393                reason: "device ordinal overflow".into(),
394            })?,
395        )
396        .map_err(|err| GpuError::DriverCallFailed {
397            reason: err.to_string(),
398        })?;
399    let attr = |attribute| -> Result<i32, GpuError> {
400        // SAFETY: device comes from cudarc's validated device::get.
401        unsafe { result::device::get_attribute(device, attribute) }.map_err(|err| {
402            GpuError::DriverCallFailed {
403                reason: err.to_string(),
404            }
405        })
406    };
407    let (free_mem_bytes, total_mem_bytes) =
408        ctx.mem_get_info()
409            .map_err(|err| GpuError::DriverCallFailed {
410                reason: err.to_string(),
411            })?;
412    let major = attr(sys::CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR)?;
413    let minor = attr(sys::CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR)?;
414    Ok(GpuDeviceInfo {
415        ordinal,
416        name: result::device::get_name(device).unwrap_or_else(|_| format!("CUDA device {ordinal}")),
417        capability: super::device::GpuCapability::from_compute_capability(major, minor),
418        sm_count: attr(sys::CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT)?,
419        max_threads_per_sm: attr(
420            sys::CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
421        )?,
422        max_shared_mem_per_block: attr(
423            sys::CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
424        )
425        .unwrap_or(0) as usize,
426        l2_cache_bytes: attr(sys::CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE)
427            .unwrap_or(0) as usize,
428        total_mem_bytes,
429        free_mem_bytes,
430        ecc_enabled: attr(sys::CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_ECC_ENABLED)
431            .unwrap_or(0)
432            != 0,
433        integrated: attr(sys::CUdevice_attribute_enum::CU_DEVICE_ATTRIBUTE_INTEGRATED).unwrap_or(0)
434            != 0,
435        mig_mode: false,
436    })
437}
438
439#[cfg(test)]
440mod module_path_lock_tests {
441    //! Locks the canonical module path for the GPU device runtime so a future
442    //! rename is a deliberate, reviewed change (precedent: issue #1157's
443    //! "lock module path" tests). This file was renamed from the generic,
444    //! colliding `gpu/runtime.rs` to `gpu/device_runtime.rs` under issue #1137.
445
446    #[test]
447    fn gpu_device_runtime_module_path_is_canonical() {
448        // Resolving `GpuRuntime` through the `device_runtime` module path
449        // pins the honest name; if the module is renamed this stops compiling.
450        _ = crate::device_runtime::GpuRuntime::is_available();
451        let type_name = std::any::type_name::<crate::device_runtime::GpuRuntime>();
452        assert!(
453            type_name.contains("device_runtime"),
454            "GpuRuntime must live in the `device_runtime` module (got {type_name})"
455        );
456    }
457}
458
459#[cfg(all(test, target_os = "linux"))]
460mod tests {
461    use super::*;
462
463    /// On a CPU-only host (no `libcuda.dylib` / `libcuda.so` reachable via the
464    /// platform loader), exercising every cudarc-touching entry point in this
465    /// crate must produce a clean `None`/`Err` and never trigger
466    /// `cudarc::panic_no_lib_found`. This is the regression guard for issues
467    /// #168 and #176, which observed a `PanicException` escaping the PyO3
468    /// boundary on macOS when `sae_manifold_fit(..., atom_basis="duchon")` or
469    /// `d_atom=1` ran on a host with no CUDA driver.
470    ///
471    /// On a host where libcuda *is* present the test still passes — it asserts
472    /// only that calls don't panic and that `is_culib_present()` agrees with
473    /// `GpuRuntime::is_available()` about the absence of a driver.
474    #[test]
475    fn cpu_only_host_never_panics_on_gpu_entry_points() {
476        // Without libcuda the runtime must report unavailable rather than
477        // panicking from inside `culib()`; with libcuda the runtime may or
478        // may not have a usable device, but the panic-free contract still
479        // holds and the dispatch smoke test below exercises it.
480        let culib_present = crate::driver::cuda_driver_library_present();
481        if !culib_present {
482            assert!(
483                !GpuRuntime::is_available(),
484                "is_culib_present()=false but GpuRuntime::is_available() returned true; \
485                 the probe guard from c10e6636 has regressed and downstream cudarc \
486                 calls will panic"
487            );
488        }
489
490        // Every public GPU dispatch must return a value (no panic) when the
491        // runtime is unavailable. We use minimum-size inputs so a host that
492        // *does* have a GPU still passes (workload below dispatch threshold
493        // → returns None / Err / CPU fallback the same way).
494        use ndarray::{Array1, Array2};
495        let a = Array2::<f64>::zeros((4, 3));
496        let b = Array2::<f64>::zeros((3, 2));
497        let v = Array1::<f64>::zeros(3);
498        let w = Array1::<f64>::ones(4);
499
500        // gpu::linalg_dispatch dispatchers
501        crate::try_fast_ab(a.view(), b.view());
502        crate::try_fast_av(a.view(), v.view());
503        crate::try_fast_atv(a.view(), w.view());
504        let mut chol_in = Array2::<f64>::eye(3);
505        crate::try_cholesky_lower_inplace(&mut chol_in);
506
507        // gpu::solver Cholesky entry points
508        let h = Array2::<f64>::eye(3);
509        let rhs = Array2::<f64>::zeros((3, 1));
510        let solve_outcome = crate::solver::cholesky_solve_gpu(h.view(), rhs.view());
511        let factor_outcome = crate::solver::cholesky_lower_gpu(h.view());
512        if !GpuRuntime::is_available() {
513            assert!(
514                solve_outcome.is_err(),
515                "cholesky_solve_gpu must Err when runtime is unavailable"
516            );
517            assert!(
518                factor_outcome.is_err(),
519                "cholesky_lower_gpu must Err when runtime is unavailable"
520            );
521        }
522
523        // NOTE: the weighted-crossprod GPU dispatcher with CPU fallback
524        // (`weighted_crossprod_gpu`) moved out of this crate to `gam-solve`
525        // (`gpu::pirls_gpu`) during the #1521 crate carve, since it depends on
526        // the higher-level PIRLS assembly. Its panic-free / Ok-via-CPU-fallback
527        // contract is now exercised by a regression test there
528        // (`weighted_crossprod_gpu_cpu_fallback_*`), not from gam-gpu, which
529        // cannot reach gam-solve without a dependency cycle.
530    }
531}