gam_gpu/
mod.rs

1// GPU acceleration support.
2//
3// Infrastructure modules live at this level and are intentionally callable
4// from CPU-only builds: all public entry points are available without CUDA,
5// and the runtime reports an unavailable backend instead of changing
6// numerical results. CUDA-specific code is compiled only for Linux builds that
7// enable the `cuda` feature, so cudarc is never loaded by default CPU-only
8// builds.
9
10// `gpu_error` is declared first so its `#[macro_use]` macros (`gpu_err!`,
11// `gpu_bail!`) are in textual scope for every module below — `backend_probe`
12// in particular calls `gpu_err!` unqualified. Referring to these
13// `#[macro_export]` macros by absolute path (`crate::gpu_err`) is rejected
14// here: `lib.rs` pulls this module tree in via `include!`, which makes every
15// exported macro "macro-expanded", and absolute-path access to those is a
16// denied future-incompat lint.
17#[macro_use]
18pub mod gpu_error;
19pub mod backend_probe;
20pub mod blas;
21#[cfg(target_os = "linux")]
22pub mod calibration;
23pub mod cpu_traits;
24pub mod device;
25pub mod device_cache;
26pub mod driver;
27pub mod device_runtime;
28pub mod linalg_dispatch;
29pub mod memory;
30pub mod numerics_device;
31pub mod numerics_host;
32pub mod policy;
33pub mod pool;
34pub mod profile;
35pub mod solver;
36
37// Domain-specific GPU kernels are isolated from the infrastructure modules.
38pub mod kernels;
39
40pub use cpu_traits::{ExecutionTarget, MatrixLocation};
41pub use device::GpuDeviceInfo;
42pub use device_runtime::GpuRuntime;
43pub use gpu_error::GpuError;
44pub use memory::{DeviceBuffer, DeviceCsrMatrix, DeviceMatrix, DeviceVector};
45pub use policy::{GpuDispatchPolicy, GpuMixedPrecisionPolicy};
46pub use pool::{balanced_partition, scatter_batched};
47pub use profile::{GpuExecutionTelemetry, KernelStat, KernelStatsSnapshot};
48
49// ---------------------------------------------------------------------------
50// User-facing policy and instrumentation hooks (formerly src/gpu.rs).
51//
52// The first production-safe step for acceleration is an explicit policy
53// layer: `Auto` may opportunistically use supported device-resident kernels,
54// `Off` guarantees the CPU path, and `Force` turns an unsupported GPU route
55// into a hard error instead of a silent CPU fallback. The numerical kernels
// are wired to call these helpers before selecting a backend; until a vendor
// backend is compiled in, this module intentionally reports "unsupported" so
// `force` fails loudly while `auto` remains a correct CPU fallback.
59// ---------------------------------------------------------------------------
60
61use serde::{Deserialize, Serialize};
62use std::fmt;
63use std::sync::OnceLock;
64
/// Coarse availability of the CUDA backend, as reported by
/// `cuda_backend_status`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum CudaBackendStatus {
    /// No process-wide GPU runtime is present.
    CudaUnavailable,
    /// A process-wide GPU runtime is present.
    CudaReady,
}
70
71#[inline]
72pub(crate) fn cuda_backend_status() -> CudaBackendStatus {
73    if device_runtime::GpuRuntime::global().is_some() {
74        CudaBackendStatus::CudaReady
75    } else {
76        CudaBackendStatus::CudaUnavailable
77    }
78}
79
/// User-facing GPU backend policy.
///
/// Defaults to [`GpuPolicy::Auto`]. Serde (de)serialization uses kebab-case
/// variant names (`auto`, `off`, `force`), the same spellings returned by
/// [`GpuPolicy::as_str`].
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum GpuPolicy {
    /// Let the solver use GPU kernels only for supported, large-enough paths.
    #[default]
    Auto,
    /// Always use CPU kernels.
    Off,
    /// Require GPU kernels and error if the requested path is unsupported.
    Force,
}
92
93impl GpuPolicy {
94    pub fn parse(raw: &str) -> Option<Self> {
95        match raw.trim().to_ascii_lowercase().as_str() {
96            "auto" => Some(Self::Auto),
97            "off" => Some(Self::Off),
98            "force" => Some(Self::Force),
99            _ => None,
100        }
101    }
102
103    #[inline]
104    pub const fn as_str(self) -> &'static str {
105        match self {
106            Self::Auto => "auto",
107            Self::Off => "off",
108            Self::Force => "force",
109        }
110    }
111
112    /// Whether unsupported GPU dispatch should be surfaced as a hard error.
113    #[inline]
114    pub const fn is_force(self) -> bool {
115        matches!(self, Self::Force)
116    }
117}
118
119impl fmt::Display for GpuPolicy {
120    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
121        f.write_str(self.as_str())
122    }
123}
124
/// Fail-closed GPU residency mode (issue #1017).
///
/// Distinct from [`GpuPolicy`], which governs opportunistic per-kernel dispatch.
/// `GpuMode` is the process-wide *residency contract* the resident solver
/// consults through [`crate::device_runtime::GpuRuntime::global_or_fail`]:
///
/// * [`GpuMode::Auto`] — use the device when the probe admits it, fall back to
///   CPU otherwise (the current, working behavior; preserved bit-for-bit).
/// * [`GpuMode::Required`] — the device MUST be available; if the runtime is
///   absent the resident path returns a structured error instead of silently
///   running on the CPU. This is the fail-closed guard the reviewers asked for.
/// * [`GpuMode::Off`] — never use the device.
///
/// Defaults to [`GpuMode::Auto`]. Serde (de)serialization uses kebab-case
/// variant names (`auto`, `required`, `off`), the same spellings returned by
/// [`GpuMode::as_str`].
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
pub enum GpuMode {
    /// Use the device when available; fall back to CPU otherwise.
    #[default]
    Auto,
    /// Require the device; error (do not fall back) when it is unavailable.
    Required,
    /// Never use the device.
    Off,
}
148
149impl GpuMode {
150    /// Stable lowercase identifier.
151    #[inline]
152    pub const fn as_str(self) -> &'static str {
153        match self {
154            Self::Auto => "auto",
155            Self::Required => "required",
156            Self::Off => "off",
157        }
158    }
159}
160
161impl fmt::Display for GpuMode {
162    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
163        f.write_str(self.as_str())
164    }
165}
166
167static GPU_MODE: OnceLock<GpuMode> = OnceLock::new();
168
169/// Configure the process-wide GPU residency mode. First-writer-wins so
170/// concurrent fits cannot race the contract; a redundant late call is ignored.
171pub fn set_gpu_mode(mode: GpuMode) {
172    GPU_MODE.set(mode).ok();
173}
174
175/// Read the process-wide GPU residency mode. Defaults to [`GpuMode::Auto`]
176/// without claiming the slot, mirroring [`global_policy`] so an incidental
177/// read never locks the mode against a later explicit [`set_gpu_mode`].
178#[inline]
179pub fn gpu_mode() -> GpuMode {
180    match GPU_MODE.get() {
181        Some(m) => *m,
182        None => GpuMode::Auto,
183    }
184}
185
/// Identifier of a hot kernel for which a backend-selection decision is made
/// (see [`GpuDecision`]).
///
/// Each variant has a stable kebab-case name available through
/// [`GpuKernel::as_str`] and through `Display`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum GpuKernel {
    DenseMatvec,
    DenseTransposeMatvec,
    DenseXtWX,
    CandidateScreen,
    DenseSolve,
    MatrixFreePcg,
    SparseAssembly,
    SpatialKernelOperator,
    MarginalSlopeRows,
    RemlTrace,
    FinalInference,
}

impl GpuKernel {
    /// Stable kebab-case name used in log lines and error messages.
    #[inline]
    pub const fn as_str(self) -> &'static str {
        match self {
            Self::DenseMatvec => "dense-matvec",
            Self::DenseTransposeMatvec => "dense-transpose-matvec",
            Self::DenseXtWX => "dense-xtwx",
            Self::CandidateScreen => "candidate-screen",
            Self::DenseSolve => "dense-solve",
            Self::MatrixFreePcg => "matrix-free-pcg",
            Self::SparseAssembly => "sparse-assembly",
            Self::SpatialKernelOperator => "spatial-kernel-operator",
            Self::MarginalSlopeRows => "marginal-slope-rows",
            Self::RemlTrace => "reml-trace",
            Self::FinalInference => "final-inference",
        }
    }
}

/// Renders the kernel as the name returned by `GpuKernel::as_str`, matching
/// the `Display` impls of the sibling `GpuPolicy` and `GpuMode` enums.
impl fmt::Display for GpuKernel {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(self.as_str())
    }
}
218
/// A backend-selection decision for a single hot kernel.
///
/// Produced by [`decide`]; callers can turn a refused `force` request into an
/// error with [`GpuDecision::require_supported`] or record the outcome with
/// [`GpuDecision::log`].
#[derive(Clone, Debug)]
pub struct GpuDecision {
    /// Policy the decision was made under.
    pub policy: GpuPolicy,
    /// Kernel the decision applies to.
    pub kernel: GpuKernel,
    /// `true` when the GPU path was selected, `false` for the CPU path.
    pub use_gpu: bool,
    /// Short static tag explaining the outcome (for example
    /// `"cpu-gpu-policy-off"` or `"gpu-auto-supported"`).
    pub reason: &'static str,
}
227
228static POLICY: OnceLock<GpuPolicy> = OnceLock::new();
229
230#[inline]
231pub fn global_policy() -> GpuPolicy {
232    // Reading the policy must NOT claim the OnceLock slot: returning the
233    // default `Auto` via `get_or_init` would race against an explicit
234    // `configure_global_policy(...)` made later in the same process and
235    // silently lock the policy to `Auto`.  Keep the slot uninitialized
236    // until explicitly configured so first-writer-wins applies only to
237    // genuine writes, not to incidental reads from probe/dispatch code.
238    match POLICY.get() {
239        Some(p) => *p,
240        None => GpuPolicy::Auto,
241    }
242}
243
244/// Configure the process-wide policy before solver kernels are selected.
245/// If a previous explicit configuration already set the policy, the first
246/// value wins so concurrent fits cannot race policy changes.  Reads of
247/// `global_policy()` never claim the slot, so the very first explicit
248/// configuration always sticks even if dispatch code observed the
249/// default `Auto` beforehand.
250pub fn configure_global_policy(policy: GpuPolicy) {
251    // First-writer-wins semantics; ignore a redundant late call.
252    POLICY.set(policy).ok();
253}
254
255/// True when direct solver GPU entry points should be attempted.
256///
257/// `Auto` attempts CUDA only after the runtime probe finds a usable device.
258/// `Off` pins the process to CPU. `Force` attempts the GPU path so missing
259/// runtime/backend support becomes an explicit error at the callee instead of
260/// an implicit CPU route.
261#[inline]
262pub fn cuda_selected() -> bool {
263    match global_policy() {
264        GpuPolicy::Auto => device_runtime::GpuRuntime::is_available(),
265        GpuPolicy::Off => false,
266        GpuPolicy::Force => true,
267    }
268}
269
/// Call-site eligibility of a GPU kernel, expressed as a single joint state.
///
/// A caller builds exactly one variant, capturing both whether a backend is
/// compiled in and whether the workload clears the runtime threshold. This
/// replaces an earlier `(supported: bool, large_enough: bool)` pair, whose two
/// flags could be swapped silently at a call site; here every meaningful state
/// has one constructor and the `match` in [`decide`] is total.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum GpuEligibility {
    /// No vendor backend for this kernel is compiled into this build.
    BackendNotCompiled,
    /// A backend is compiled in, but the workload (n, m, ...) falls below the
    /// runtime threshold for this kernel.
    WorkloadBelowThreshold,
    /// A backend is compiled in and the workload is large enough; only the
    /// policy and the runtime probe remain as gates.
    Eligible,
}

impl GpuEligibility {
    /// Fold a compile-time backend flag and a workload predicate into the
    /// canonical joint state.
    ///
    /// `supported == false` takes precedence: it yields
    /// [`GpuEligibility::BackendNotCompiled`] whatever `large_enough` is.
    /// Reach for this only when two independent booleans are genuinely what
    /// you have; otherwise construct the variant directly.
    #[inline]
    pub const fn from_flags(supported: bool, large_enough: bool) -> Self {
        match (supported, large_enough) {
            (false, _) => Self::BackendNotCompiled,
            (true, false) => Self::WorkloadBelowThreshold,
            (true, true) => Self::Eligible,
        }
    }
}
304
305/// Decide whether a GPU kernel may run. This is deliberately conservative:
306/// with no compiled vendor backend, `auto` returns CPU fallback and `force`
307/// returns an error at the call site through [`GpuDecision::require_supported`].
308pub fn decide(kernel: GpuKernel, eligibility: GpuEligibility) -> GpuDecision {
309    let policy = global_policy();
310    // Auto must consult the actual probed runtime, not only the
311    // compile-time eligibility.  Without this, `decide()` would claim
312    // GPU when the kernel is "compiled in" even though `GpuRuntime::global()`
313    // observed no device — silently producing CPU work via failed dispatch
314    // and hiding the cpu_reason from callers wanting to log fallback cause.
315    let runtime_available = device_runtime::GpuRuntime::is_available();
316    let (use_gpu, reason) = match (policy, eligibility) {
317        (GpuPolicy::Off, _) => (false, "cpu-gpu-policy-off"),
318        (GpuPolicy::Auto, GpuEligibility::BackendNotCompiled) => {
319            (false, "cpu-gpu-backend-not-compiled")
320        }
321        (GpuPolicy::Auto, _) if !runtime_available => (false, "cpu-gpu-runtime-unavailable"),
322        (GpuPolicy::Auto, GpuEligibility::WorkloadBelowThreshold) => {
323            (false, "cpu-workload-below-gpu-threshold")
324        }
325        (GpuPolicy::Auto, GpuEligibility::Eligible) => (true, "gpu-auto-supported"),
326        (GpuPolicy::Force, GpuEligibility::BackendNotCompiled) => {
327            (false, "cpu-gpu-force-unsupported")
328        }
329        (GpuPolicy::Force, _) if !runtime_available => (false, "cpu-gpu-force-runtime-unavailable"),
330        // Under `force`, the workload-threshold gate is intentionally bypassed:
331        // the user explicitly asked for GPU regardless of size.
332        (GpuPolicy::Force, GpuEligibility::WorkloadBelowThreshold)
333        | (GpuPolicy::Force, GpuEligibility::Eligible) => (true, "gpu-force-supported"),
334    };
335    GpuDecision {
336        policy,
337        kernel,
338        use_gpu,
339        reason,
340    }
341}
342
343impl GpuDecision {
344    pub fn require_supported(&self) -> Result<(), String> {
345        if self.policy == GpuPolicy::Force && !self.use_gpu {
346            return Err(format!(
347                "gpu=force requested kernel '{}' but no supported device backend is available ({})",
348                self.kernel.as_str(),
349                self.reason
350            ));
351        }
352        Ok(())
353    }
354
355    pub fn log(self) {
356        log::debug!(
357            "[GPU backend] kernel={} policy={} selected={} reason={}",
358            self.kernel.as_str(),
359            self.policy.as_str(),
360            self.use_gpu,
361            self.reason
362        );
363    }
364}
365
366/// Emit the roadmap-visible kernels at startup/debug time without affecting
367/// numerical execution. This keeps backend coverage auditable as real device
368/// kernels are added incrementally.
369pub fn log_backend_inventory_once() {
370    static LOGGED: OnceLock<()> = OnceLock::new();
371    LOGGED.get_or_init(|| {
372        let compiled_backends = if cfg!(target_os = "linux") {
373            "cuda-dynamic"
374        } else {
375            "none"
376        };
377        log::debug!(
378            "[GPU backend] policy={} compiled_backends={} kernels=dense-matvec,dense-transpose-matvec,dense-xtwx,candidate-screen,dense-solve,matrix-free-pcg,sparse-assembly,spatial-kernel-operator,marginal-slope-rows,reml-trace,final-inference",
379            global_policy().as_str(),
380            compiled_backends
381        );
382    });
383}
384
/// Module-level entry point that forwards `a` and `b` unchanged to
/// [`linalg_dispatch::try_fast_ab`] and returns its result as-is.
///
/// NOTE(review): presumably the accelerated product `A·B`, with `None`
/// meaning "no fast path taken, use the CPU implementation" — confirm against
/// `linalg_dispatch`.
#[inline]
pub fn try_fast_ab(
    a: ndarray::ArrayView2<'_, f64>,
    b: ndarray::ArrayView2<'_, f64>,
) -> Option<ndarray::Array2<f64>> {
    linalg_dispatch::try_fast_ab(a, b)
}
/// Module-level entry point that forwards `ordinal`, `a`, and `b` unchanged to
/// [`linalg_dispatch::try_fast_atb_on_ordinal`] and returns its result as-is.
///
/// NOTE(review): presumably the accelerated product `Aᵀ·B` on the device
/// selected by `ordinal`, with `None` meaning "no fast path taken" — confirm
/// against `linalg_dispatch`.
#[inline]
pub fn try_fast_atb_on_ordinal(
    ordinal: usize,
    a: ndarray::ArrayView2<'_, f64>,
    b: ndarray::ArrayView2<'_, f64>,
) -> Option<ndarray::Array2<f64>> {
    linalg_dispatch::try_fast_atb_on_ordinal(ordinal, a, b)
}
/// Module-level entry point that forwards `a` and `v` unchanged to
/// [`linalg_dispatch::try_fast_av`] and returns its result as-is.
///
/// NOTE(review): presumably the accelerated matrix-vector product `A·v`, with
/// `None` meaning "no fast path taken" — confirm against `linalg_dispatch`.
#[inline]
pub fn try_fast_av(
    a: ndarray::ArrayView2<'_, f64>,
    v: ndarray::ArrayView1<'_, f64>,
) -> Option<ndarray::Array1<f64>> {
    linalg_dispatch::try_fast_av(a, v)
}
/// Module-level entry point that forwards `a` and `v` unchanged to
/// [`linalg_dispatch::try_fast_atv`] and returns its result as-is.
///
/// NOTE(review): presumably the accelerated transposed product `Aᵀ·v`, with
/// `None` meaning "no fast path taken" — confirm against `linalg_dispatch`.
#[inline]
pub fn try_fast_atv(
    a: ndarray::ArrayView2<'_, f64>,
    v: ndarray::ArrayView1<'_, f64>,
) -> Option<ndarray::Array1<f64>> {
    linalg_dispatch::try_fast_atv(a, v)
}
/// Module-level entry point that forwards the 3-D `a` and the 2-D `b`
/// unchanged to [`linalg_dispatch::try_fast_ab_broadcast_b_batched`] and
/// returns its result as-is.
///
/// NOTE(review): presumably a batched `A[i]·B` with the single `b` shared by
/// every batch entry, and `None` meaning "no fast path taken" — confirm the
/// batch axis and semantics against `linalg_dispatch`.
#[inline]
pub fn try_fast_ab_broadcast_b_batched(
    a: ndarray::ArrayView3<'_, f64>,
    b: ndarray::ArrayView2<'_, f64>,
) -> Option<ndarray::Array3<f64>> {
    linalg_dispatch::try_fast_ab_broadcast_b_batched(a, b)
}
/// Module-level entry point that forwards the 3-D `a` and `b` unchanged to
/// [`linalg_dispatch::try_fast_abt_strided_batched`] and returns its result
/// as-is.
///
/// NOTE(review): presumably a batched `A[i]·B[i]ᵀ`, with `None` meaning "no
/// fast path taken" — confirm the batch axis and semantics against
/// `linalg_dispatch`.
#[inline]
pub fn try_fast_abt_strided_batched(
    a: ndarray::ArrayView3<'_, f64>,
    b: ndarray::ArrayView3<'_, f64>,
) -> Option<ndarray::Array3<f64>> {
    linalg_dispatch::try_fast_abt_strided_batched(a, b)
}
/// Module-level entry point that forwards `a` to
/// [`linalg_dispatch::try_cholesky_lower_inplace`] and returns its result
/// as-is.
///
/// NOTE(review): presumably overwrites `a` with its lower Cholesky factor and
/// returns `Some(())` on success; the state of `a` after `None` is not visible
/// from here — confirm against `linalg_dispatch`.
#[inline]
pub fn try_cholesky_lower_inplace(a: &mut ndarray::Array2<f64>) -> Option<()> {
    linalg_dispatch::try_cholesky_lower_inplace(a)
}
/// Module-level entry point that forwards `matrices` to
/// [`linalg_dispatch::try_cholesky_batched_lower_inplace`] and returns its
/// result as-is.
///
/// NOTE(review): presumably factors every matrix in the slice in place and
/// returns `Some(())` only when the whole batch succeeded; the state of the
/// slice after `None` is not visible from here — confirm against
/// `linalg_dispatch`.
#[inline]
pub fn try_cholesky_batched_lower_inplace(matrices: &mut [ndarray::Array2<f64>]) -> Option<()> {
    linalg_dispatch::try_cholesky_batched_lower_inplace(matrices)
}
/// Module-level entry point that forwards `lower` and `rhs` unchanged to
/// [`linalg_dispatch::try_solve_lower_triangular_matrix`] and returns its
/// result as-is.
///
/// NOTE(review): presumably solves `L·X = rhs` for a lower-triangular `L`,
/// with `None` meaning "no fast path taken" — confirm against
/// `linalg_dispatch`.
#[inline]
pub fn try_solve_lower_triangular_matrix(
    lower: ndarray::ArrayView2<'_, f64>,
    rhs: ndarray::ArrayView2<'_, f64>,
) -> Option<ndarray::Array2<f64>> {
    linalg_dispatch::try_solve_lower_triangular_matrix(lower, rhs)
}
/// Module-level entry point that forwards `upper` and `rhs` unchanged to
/// [`linalg_dispatch::try_solve_upper_triangular_matrix`] and returns its
/// result as-is.
///
/// NOTE(review): presumably solves `U·X = rhs` for an upper-triangular `U`,
/// with `None` meaning "no fast path taken" — confirm against
/// `linalg_dispatch`.
#[inline]
pub fn try_solve_upper_triangular_matrix(
    upper: ndarray::ArrayView2<'_, f64>,
    rhs: ndarray::ArrayView2<'_, f64>,
) -> Option<ndarray::Array2<f64>> {
    linalg_dispatch::try_solve_upper_triangular_matrix(upper, rhs)
}
/// Unit tests for the policy layer: parsing, eligibility folding, the
/// fail-closed residency mode, PIRLS admission, and `force` error reporting.
#[cfg(test)]
mod policy_tests {
    use super::*;

    #[test]
    fn parses_canonical_user_gpu_policy_values() {
        assert_eq!(GpuPolicy::parse("auto"), Some(GpuPolicy::Auto));
        assert_eq!(GpuPolicy::parse("off"), Some(GpuPolicy::Off));
        assert_eq!(GpuPolicy::parse("force"), Some(GpuPolicy::Force));
        assert_eq!(GpuPolicy::parse("cpu"), None);
        assert_eq!(GpuPolicy::parse(""), None);
        assert_eq!(GpuPolicy::parse("wat"), None);
    }

    #[test]
    fn policy_parse_normalizes_input_and_round_trips() {
        // `parse` trims whitespace and ignores ASCII case.
        assert_eq!(GpuPolicy::parse("  FORCE\n"), Some(GpuPolicy::Force));
        assert_eq!(GpuPolicy::parse("Auto"), Some(GpuPolicy::Auto));
        // The canonical name of every variant parses back to that variant,
        // and `Display` agrees with `as_str`.
        for policy in [GpuPolicy::Auto, GpuPolicy::Off, GpuPolicy::Force] {
            assert_eq!(GpuPolicy::parse(policy.as_str()), Some(policy));
            assert_eq!(policy.to_string(), policy.as_str());
        }
    }

    #[test]
    fn eligibility_from_flags_covers_every_combination() {
        // A missing backend dominates the workload flag.
        assert_eq!(
            GpuEligibility::from_flags(false, false),
            GpuEligibility::BackendNotCompiled
        );
        assert_eq!(
            GpuEligibility::from_flags(false, true),
            GpuEligibility::BackendNotCompiled
        );
        assert_eq!(
            GpuEligibility::from_flags(true, false),
            GpuEligibility::WorkloadBelowThreshold
        );
        assert_eq!(
            GpuEligibility::from_flags(true, true),
            GpuEligibility::Eligible
        );
    }

    #[test]
    fn execution_path_defaults_to_cpu() {
        use gam_problem::ExecutionPath;
        // The truthful execution-path classifier must default to the CPU path,
        // so a result struct that is never told otherwise cannot claim the
        // device (the original `used_device: bool` defaulted the same way, but
        // now the "no device" state is a named, non-lying variant).
        assert_eq!(ExecutionPath::default(), ExecutionPath::Cpu);
        assert!(!ExecutionPath::Cpu.used_device());
        assert!(ExecutionPath::GpuResidentFull.used_device());
    }

    #[test]
    fn gpu_mode_required_fails_closed_when_device_absent() {
        use crate::device_runtime::GpuRuntime;
        // Off always refuses, regardless of hardware.
        assert!(matches!(
            GpuRuntime::global_or_fail(GpuMode::Off),
            Err(GpuError::DriverLibraryUnavailable { .. })
        ));

        if GpuRuntime::is_available() {
            // On a GPU host both Auto and Required must succeed.
            assert!(GpuRuntime::global_or_fail(GpuMode::Required).is_ok());
            assert!(GpuRuntime::global_or_fail(GpuMode::Auto).is_ok());
        } else {
            // Fail-closed: Required surfaces a STRUCTURED error rather than a
            // silent CPU fallback. Auto also reports unavailable (callers there
            // swallow it and fall back), but the variant is what lets Required
            // propagate it as fatal.
            let required = GpuRuntime::global_or_fail(GpuMode::Required);
            assert!(
                matches!(required, Err(GpuError::DriverLibraryUnavailable { .. })),
                "GpuMode::Required must fail closed when the device is absent, got {required:?}"
            );
            assert!(GpuRuntime::global_or_fail(GpuMode::Auto).is_err());
        }
    }

    #[test]
    fn pirls_loop_admission_requires_runtime_size_and_known_family() {
        use crate::policy::{PirlsLoopAdmission, PirlsLoopCurvatureKind, PirlsLoopFamilyKind};
        let pol = GpuDispatchPolicy::default();
        let base = PirlsLoopAdmission {
            n: 80_000,
            p: 44,
            family: Some(PirlsLoopFamilyKind::BernoulliLogit),
            curvature: PirlsLoopCurvatureKind::Fisher,
            gpu_available: true,
        };
        assert!(pol.should_use_gpu_pirls_loop(base));
        // No runtime → never dispatch.
        assert!(!pol.should_use_gpu_pirls_loop(PirlsLoopAdmission {
            gpu_available: false,
            ..base
        }));
        // Below dense-work floor.
        assert!(!pol.should_use_gpu_pirls_loop(PirlsLoopAdmission { n: 1_000, ..base }));
        // Small n with large p is admitted because 2*n*p^2 clears the work floor.
        assert!(pol.should_use_gpu_pirls_loop(PirlsLoopAdmission {
            n: 2_000,
            p: 2_048,
            ..base
        }));
        // Below column floor.
        assert!(!pol.should_use_gpu_pirls_loop(PirlsLoopAdmission { p: 8, ..base }));
        // Custom family (not in 6 JIT-cached set) declines.
        assert!(!pol.should_use_gpu_pirls_loop(PirlsLoopAdmission {
            family: None,
            ..base
        }));
    }

    #[test]
    fn force_policy_reports_unsupported_kernel() {
        // Use the reason tag `decide` really emits for a forced kernel with no
        // compiled backend, so the fixture mirrors a decision that can occur.
        let reason = "cpu-gpu-force-unsupported";
        let decision = GpuDecision {
            policy: GpuPolicy::Force,
            kernel: GpuKernel::DenseXtWX,
            use_gpu: false,
            reason,
        };
        let err = decision.require_supported().unwrap_err();
        assert!(err.contains("dense-xtwx"));
        assert!(err.contains("gpu=force"));
        // The reason must reach the message so callers can see why.
        assert!(err.contains(reason));
    }

    #[test]
    fn non_force_policies_never_require_support() {
        // Only `Force` turns a CPU verdict into an error.
        for policy in [GpuPolicy::Auto, GpuPolicy::Off] {
            let decision = GpuDecision {
                policy,
                kernel: GpuKernel::DenseSolve,
                use_gpu: false,
                reason: "cpu-gpu-policy-off",
            };
            assert!(decision.require_supported().is_ok());
        }
        // A satisfied `Force` request is fine as well.
        let satisfied = GpuDecision {
            policy: GpuPolicy::Force,
            kernel: GpuKernel::DenseSolve,
            use_gpu: true,
            reason: "gpu-force-supported",
        };
        assert!(satisfied.require_supported().is_ok());
    }
}
gam_gpu/mod.rs

gam_gpu/
mod.rs