gam_gpu/
mod.rs

// GPU acceleration support.
//
// Infrastructure modules live at this level and are intentionally callable
// from CPU-only builds: all public entry points are available without CUDA,
// and the runtime reports an unavailable backend instead of changing
// numerical results. CUDA-specific code is compiled only for Linux builds that
// enable the `cuda` feature, so cudarc is never loaded by default CPU-only
// builds.
9
// `gpu_error` is declared first so its `#[macro_use]` macros (`gpu_err!`,
// `gpu_bail!`) are in textual scope for every module below — `backend_probe`
// in particular calls `gpu_err!` unqualified. Referring to these
// `#[macro_export]` macros by absolute path (`crate::gpu_err`) is rejected
// here: `lib.rs` pulls this module tree in via `include!`, which makes every
// exported macro "macro-expanded", and absolute-path access to those is a
// denied future-incompat lint.
17#[macro_use]
18pub mod gpu_error;
19pub mod backend_probe;
20pub mod blas;
21#[cfg(target_os = "linux")]
22pub mod calibration;
23pub mod cpu_traits;
24pub mod device;
25pub mod device_cache;
26pub mod driver;
27pub mod device_runtime;
28pub mod encode_throughput;
29pub mod linalg_dispatch;
30pub mod memory;
31pub mod numerics_device;
32pub mod numerics_host;
33pub mod policy;
34pub mod pool;
35pub mod profile;
36pub mod solver;
37
38// Domain-specific GPU kernels are isolated from the infrastructure modules.
39pub mod kernels;
40
41pub use cpu_traits::MatrixLocation;
42pub use device::GpuDeviceInfo;
43pub use device_runtime::GpuRuntime;
44pub use gpu_error::GpuError;
45pub use memory::{DeviceBuffer, DeviceCsrMatrix, DeviceMatrix, DeviceVector};
46pub use policy::{GpuDispatchPolicy, GpuMixedPrecisionPolicy};
47pub use pool::{balanced_partition, scatter_batched};
48pub use profile::{GpuExecutionTelemetry, KernelStat, KernelStatsSnapshot};
49
// ---------------------------------------------------------------------------
// User-facing policy and instrumentation hooks (formerly src/gpu.rs).
//
// The first production-safe step for acceleration is an explicit policy
// layer: `Auto` may opportunistically use supported device-resident kernels,
// `Off` guarantees the CPU path, and `Force` turns an unsupported GPU route
// into a hard error instead of a silent CPU fallback. The numerical kernels
// are wired to call these helpers before selecting a backend; until a vendor
// backend is compiled in this module intentionally reports "unsupported" so
// `force` fails loudly while `auto` remains a correct CPU fallback.
// ---------------------------------------------------------------------------
61
62use serde::{Deserialize, Serialize};
63use std::fmt;
64use std::sync::OnceLock;
65
/// Two-state availability summary for the CUDA backend, as produced by
/// `cuda_backend_status`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CudaBackendStatus {
    /// The global GPU runtime is absent.
    CudaUnavailable,
    /// The global GPU runtime is present.
    CudaReady,
}
71
72#[inline]
73pub(crate) fn cuda_backend_status() -> CudaBackendStatus {
74    if device_runtime::GpuRuntime::global().is_some() {
75        CudaBackendStatus::CudaReady
76    } else {
77        CudaBackendStatus::CudaUnavailable
78    }
79}
80
81/// User-facing GPU backend policy.
82#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Serialize, Deserialize)]
83#[serde(rename_all = "kebab-case")]
84pub enum GpuPolicy {
85    /// Let the solver use GPU kernels only for supported, large-enough paths.
86    #[default]
87    Auto,
88    /// Always use CPU kernels.
89    Off,
90    /// Require GPU kernels and error if the requested path is unsupported.
91    Force,
92}
93
94impl GpuPolicy {
95    pub fn parse(raw: &str) -> Option<Self> {
96        match raw.trim().to_ascii_lowercase().as_str() {
97            "auto" => Some(Self::Auto),
98            "off" => Some(Self::Off),
99            "force" => Some(Self::Force),
100            _ => None,
101        }
102    }
103
104    #[inline]
105    pub const fn as_str(self) -> &'static str {
106        match self {
107            Self::Auto => "auto",
108            Self::Off => "off",
109            Self::Force => "force",
110        }
111    }
112
113    /// Whether unsupported GPU dispatch should be surfaced as a hard error.
114    #[inline]
115    pub const fn is_force(self) -> bool {
116        matches!(self, Self::Force)
117    }
118}
119
120impl fmt::Display for GpuPolicy {
121    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
122        f.write_str(self.as_str())
123    }
124}
125
126/// Fail-closed GPU residency mode (issue #1017).
127///
128/// Distinct from [`GpuPolicy`], which governs opportunistic per-kernel dispatch.
129/// `GpuMode` is the process-wide *residency contract* the resident solver
130/// consults through [`crate::device_runtime::GpuRuntime::global_or_fail`]:
131///
132/// * [`GpuMode::Auto`] — use the device when the probe admits it, fall back to
133///   CPU otherwise (the current, working behavior; preserved bit-for-bit).
134/// * [`GpuMode::Required`] — the device MUST be available; if the runtime is
135///   absent the resident path returns a structured error instead of silently
136///   running on the CPU. This is the fail-closed guard the reviewers asked for.
137/// * [`GpuMode::Off`] — never use the device.
138#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Serialize, Deserialize)]
139#[serde(rename_all = "kebab-case")]
140pub enum GpuMode {
141    /// Use the device when available; fall back to CPU otherwise.
142    #[default]
143    Auto,
144    /// Require the device; error (do not fall back) when it is unavailable.
145    Required,
146    /// Never use the device.
147    Off,
148}
149
150impl GpuMode {
151    /// Stable lowercase identifier.
152    #[inline]
153    pub const fn as_str(self) -> &'static str {
154        match self {
155            Self::Auto => "auto",
156            Self::Required => "required",
157            Self::Off => "off",
158        }
159    }
160}
161
162impl fmt::Display for GpuMode {
163    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
164        f.write_str(self.as_str())
165    }
166}
167
168static GPU_MODE: OnceLock<GpuMode> = OnceLock::new();
169
170/// Configure the process-wide GPU residency mode. First-writer-wins so
171/// concurrent fits cannot race the contract; a redundant late call is ignored.
172pub fn set_gpu_mode(mode: GpuMode) {
173    GPU_MODE.set(mode).ok();
174}
175
176/// Read the process-wide GPU residency mode. Defaults to [`GpuMode::Auto`]
177/// without claiming the slot, mirroring [`global_policy`] so an incidental
178/// read never locks the mode against a later explicit [`set_gpu_mode`].
179#[inline]
180pub fn gpu_mode() -> GpuMode {
181    match GPU_MODE.get() {
182        Some(m) => *m,
183        None => GpuMode::Auto,
184    }
185}
186
/// Identifier for each hot kernel that takes part in backend selection.
///
/// Every variant has a kebab-case name, available through
/// [`GpuKernel::as_str`], that is used in log lines and error messages.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GpuKernel {
    DenseMatvec,
    DenseTransposeMatvec,
    DenseXtWX,
    CandidateScreen,
    DenseSolve,
    MatrixFreePcg,
    SparseAssembly,
    SpatialKernelOperator,
    MarginalSlopeRows,
    RemlTrace,
    FinalInference,
}

impl GpuKernel {
    /// Kebab-case name of this kernel.
    pub const fn as_str(self) -> &'static str {
        match self {
            // Dense linear algebra.
            Self::DenseMatvec => "dense-matvec",
            Self::DenseTransposeMatvec => "dense-transpose-matvec",
            Self::DenseXtWX => "dense-xtwx",
            Self::DenseSolve => "dense-solve",
            // Everything else, in declaration order.
            Self::CandidateScreen => "candidate-screen",
            Self::MatrixFreePcg => "matrix-free-pcg",
            Self::SparseAssembly => "sparse-assembly",
            Self::SpatialKernelOperator => "spatial-kernel-operator",
            Self::MarginalSlopeRows => "marginal-slope-rows",
            Self::RemlTrace => "reml-trace",
            Self::FinalInference => "final-inference",
        }
    }
}
219
220/// A backend-selection decision for a single hot kernel.
221#[derive(Clone, Debug)]
222pub struct GpuDecision {
223    pub policy: GpuPolicy,
224    pub kernel: GpuKernel,
225    pub use_gpu: bool,
226    pub reason: &'static str,
227}
228
229static POLICY: OnceLock<GpuPolicy> = OnceLock::new();
230
231#[inline]
232pub fn global_policy() -> GpuPolicy {
233    // Reading the policy must NOT claim the OnceLock slot: returning the
234    // default `Auto` via `get_or_init` would race against an explicit
235    // `configure_global_policy(...)` made later in the same process and
236    // silently lock the policy to `Auto`.  Keep the slot uninitialized
237    // until explicitly configured so first-writer-wins applies only to
238    // genuine writes, not to incidental reads from probe/dispatch code.
239    match POLICY.get() {
240        Some(p) => *p,
241        None => GpuPolicy::Auto,
242    }
243}
244
245/// Configure the process-wide policy before solver kernels are selected.
246/// If a previous explicit configuration already set the policy, the first
247/// value wins so concurrent fits cannot race policy changes.  Reads of
248/// `global_policy()` never claim the slot, so the very first explicit
249/// configuration always sticks even if dispatch code observed the
250/// default `Auto` beforehand.
251pub fn configure_global_policy(policy: GpuPolicy) {
252    // First-writer-wins semantics; ignore a redundant late call.
253    POLICY.set(policy).ok();
254}
255
256/// True when direct solver GPU entry points should be attempted.
257///
258/// `Auto` attempts CUDA only after the runtime probe finds a usable device.
259/// `Off` pins the process to CPU. `Force` attempts the GPU path so missing
260/// runtime/backend support becomes an explicit error at the callee instead of
261/// an implicit CPU route.
262#[inline]
263pub fn cuda_selected() -> bool {
264    match global_policy() {
265        GpuPolicy::Auto => device_runtime::GpuRuntime::is_available(),
266        GpuPolicy::Off => false,
267        GpuPolicy::Force => true,
268    }
269}
270
/// Call-site eligibility of a GPU kernel, combining backend presence with the
/// workload-size check.
///
/// A caller builds exactly one variant. This replaces the earlier
/// `(supported: bool, large_enough: bool)` pair, whose two flags could be
/// swapped silently at a call site: each meaningful state now has a single
/// constructor, and the `match` inside [`decide`] is total.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GpuEligibility {
    /// This build has no vendor backend compiled in for the kernel.
    BackendNotCompiled,
    /// A backend is compiled in, but the workload (n, m, ...) falls below the
    /// kernel's runtime threshold.
    WorkloadBelowThreshold,
    /// A backend is compiled in and the workload is large enough; only the
    /// policy and the runtime probe remain to be checked.
    Eligible,
}

impl GpuEligibility {
    /// Fold two independent booleans into the canonical joint state.
    ///
    /// `supported` takes precedence: when it is `false` the result is
    /// [`GpuEligibility::BackendNotCompiled`] regardless of `large_enough`.
    /// Prefer naming a variant directly unless you really do hold two
    /// separate flags.
    #[inline]
    pub const fn from_flags(supported: bool, large_enough: bool) -> Self {
        match (supported, large_enough) {
            (false, _) => Self::BackendNotCompiled,
            (true, false) => Self::WorkloadBelowThreshold,
            (true, true) => Self::Eligible,
        }
    }
}
305
306/// Decide whether a GPU kernel may run. This is deliberately conservative:
307/// with no compiled vendor backend, `auto` returns CPU fallback and `force`
308/// returns an error at the call site through [`GpuDecision::require_supported`].
309pub fn decide(kernel: GpuKernel, eligibility: GpuEligibility) -> GpuDecision {
310    let policy = global_policy();
311    // Auto must consult the actual probed runtime, not only the
312    // compile-time eligibility.  Without this, `decide()` would claim
313    // GPU when the kernel is "compiled in" even though `GpuRuntime::global()`
314    // observed no device — silently producing CPU work via failed dispatch
315    // and hiding the cpu_reason from callers wanting to log fallback cause.
316    let runtime_available = device_runtime::GpuRuntime::is_available();
317    let (use_gpu, reason) = match (policy, eligibility) {
318        (GpuPolicy::Off, _) => (false, "cpu-gpu-policy-off"),
319        (GpuPolicy::Auto, GpuEligibility::BackendNotCompiled) => {
320            (false, "cpu-gpu-backend-not-compiled")
321        }
322        (GpuPolicy::Auto, _) if !runtime_available => (false, "cpu-gpu-runtime-unavailable"),
323        (GpuPolicy::Auto, GpuEligibility::WorkloadBelowThreshold) => {
324            (false, "cpu-workload-below-gpu-threshold")
325        }
326        (GpuPolicy::Auto, GpuEligibility::Eligible) => (true, "gpu-auto-supported"),
327        (GpuPolicy::Force, GpuEligibility::BackendNotCompiled) => {
328            (false, "cpu-gpu-force-unsupported")
329        }
330        (GpuPolicy::Force, _) if !runtime_available => (false, "cpu-gpu-force-runtime-unavailable"),
331        // Under `force`, the workload-threshold gate is intentionally bypassed:
332        // the user explicitly asked for GPU regardless of size.
333        (GpuPolicy::Force, GpuEligibility::WorkloadBelowThreshold)
334        | (GpuPolicy::Force, GpuEligibility::Eligible) => (true, "gpu-force-supported"),
335    };
336    GpuDecision {
337        policy,
338        kernel,
339        use_gpu,
340        reason,
341    }
342}
343
344impl GpuDecision {
345    pub fn require_supported(&self) -> Result<(), String> {
346        if self.policy == GpuPolicy::Force && !self.use_gpu {
347            return Err(format!(
348                "gpu=force requested kernel '{}' but no supported device backend is available ({})",
349                self.kernel.as_str(),
350                self.reason
351            ));
352        }
353        Ok(())
354    }
355
356    pub fn log(self) {
357        log::debug!(
358            "[GPU backend] kernel={} policy={} selected={} reason={}",
359            self.kernel.as_str(),
360            self.policy.as_str(),
361            self.use_gpu,
362            self.reason
363        );
364    }
365}
366
367/// Emit the roadmap-visible kernels at startup/debug time without affecting
368/// numerical execution. This keeps backend coverage auditable as real device
369/// kernels are added incrementally.
370pub fn log_backend_inventory_once() {
371    static LOGGED: OnceLock<()> = OnceLock::new();
372    LOGGED.get_or_init(|| {
373        let compiled_backends = if cfg!(target_os = "linux") {
374            "cuda-dynamic"
375        } else {
376            "none"
377        };
378        log::debug!(
379            "[GPU backend] policy={} compiled_backends={} kernels=dense-matvec,dense-transpose-matvec,dense-xtwx,candidate-screen,dense-solve,matrix-free-pcg,sparse-assembly,spatial-kernel-operator,marginal-slope-rows,reml-trace,final-inference",
380            global_policy().as_str(),
381            compiled_backends
382        );
383    });
384}
385
386#[inline]
387pub fn try_fast_ab(
388    a: ndarray::ArrayView2<'_, f64>,
389    b: ndarray::ArrayView2<'_, f64>,
390) -> Option<ndarray::Array2<f64>> {
391    linalg_dispatch::try_fast_ab(a, b)
392}
393#[inline]
394pub fn try_fast_atb_on_ordinal(
395    ordinal: usize,
396    a: ndarray::ArrayView2<'_, f64>,
397    b: ndarray::ArrayView2<'_, f64>,
398) -> Option<ndarray::Array2<f64>> {
399    linalg_dispatch::try_fast_atb_on_ordinal(ordinal, a, b)
400}
401#[inline]
402pub fn try_fast_av(
403    a: ndarray::ArrayView2<'_, f64>,
404    v: ndarray::ArrayView1<'_, f64>,
405) -> Option<ndarray::Array1<f64>> {
406    linalg_dispatch::try_fast_av(a, v)
407}
408#[inline]
409pub fn try_fast_atv(
410    a: ndarray::ArrayView2<'_, f64>,
411    v: ndarray::ArrayView1<'_, f64>,
412) -> Option<ndarray::Array1<f64>> {
413    linalg_dispatch::try_fast_atv(a, v)
414}
415#[inline]
416pub fn try_fast_ab_broadcast_b_batched(
417    a: ndarray::ArrayView3<'_, f64>,
418    b: ndarray::ArrayView2<'_, f64>,
419) -> Option<ndarray::Array3<f64>> {
420    linalg_dispatch::try_fast_ab_broadcast_b_batched(a, b)
421}
422#[inline]
423pub fn try_fast_abt_strided_batched(
424    a: ndarray::ArrayView3<'_, f64>,
425    b: ndarray::ArrayView3<'_, f64>,
426) -> Option<ndarray::Array3<f64>> {
427    linalg_dispatch::try_fast_abt_strided_batched(a, b)
428}
429#[inline]
430pub fn try_cholesky_lower_inplace(a: &mut ndarray::Array2<f64>) -> Option<()> {
431    linalg_dispatch::try_cholesky_lower_inplace(a)
432}
433#[inline]
434pub fn try_cholesky_batched_lower_inplace(matrices: &mut [ndarray::Array2<f64>]) -> Option<()> {
435    linalg_dispatch::try_cholesky_batched_lower_inplace(matrices)
436}
437#[inline]
438pub fn try_solve_lower_triangular_matrix(
439    lower: ndarray::ArrayView2<'_, f64>,
440    rhs: ndarray::ArrayView2<'_, f64>,
441) -> Option<ndarray::Array2<f64>> {
442    linalg_dispatch::try_solve_lower_triangular_matrix(lower, rhs)
443}
444#[inline]
445pub fn try_solve_upper_triangular_matrix(
446    upper: ndarray::ArrayView2<'_, f64>,
447    rhs: ndarray::ArrayView2<'_, f64>,
448) -> Option<ndarray::Array2<f64>> {
449    linalg_dispatch::try_solve_upper_triangular_matrix(upper, rhs)
450}
#[cfg(test)]
mod policy_tests {
    use super::*;

    /// `GpuPolicy::parse` accepts the three canonical names and nothing else.
    #[test]
    fn parses_canonical_user_gpu_policy_values() {
        let accepted = [
            ("auto", GpuPolicy::Auto),
            ("off", GpuPolicy::Off),
            ("force", GpuPolicy::Force),
        ];
        for (text, policy) in accepted {
            assert_eq!(GpuPolicy::parse(text), Some(policy));
        }
        for text in ["cpu", "", "wat"] {
            assert_eq!(GpuPolicy::parse(text), None);
        }
    }

    /// The execution-path classifier must start out on the CPU path.
    #[test]
    fn execution_path_defaults_to_cpu() {
        use gam_problem::ExecutionPath;
        // A result struct that is never told otherwise must not be able to
        // claim the device. The original `used_device: bool` defaulted the
        // same way, but the "no device" state is now a named, non-lying
        // variant.
        let default_path = ExecutionPath::default();
        assert_eq!(default_path, ExecutionPath::Cpu);
        assert!(!ExecutionPath::Cpu.used_device());
        assert!(ExecutionPath::GpuResidentFull.used_device());
    }

    /// `GpuMode::Required` must surface a structured error when no device is
    /// available.
    #[test]
    fn gpu_mode_required_fails_closed_when_device_absent() {
        use crate::device_runtime::GpuRuntime;
        // `Off` refuses on every host, with or without hardware.
        assert!(matches!(
            GpuRuntime::global_or_fail(GpuMode::Off),
            Err(GpuError::DriverLibraryUnavailable { .. })
        ));

        if !GpuRuntime::is_available() {
            // Fail-closed: `Required` reports a STRUCTURED error instead of a
            // silent CPU fallback. `Auto` reports unavailable too (its callers
            // swallow that and fall back), but it is the variant that lets
            // `Required` propagate the condition as fatal.
            let required = GpuRuntime::global_or_fail(GpuMode::Required);
            assert!(
                matches!(required, Err(GpuError::DriverLibraryUnavailable { .. })),
                "GpuMode::Required must fail closed when the device is absent, got {required:?}"
            );
            assert!(GpuRuntime::global_or_fail(GpuMode::Auto).is_err());
            return;
        }

        // GPU host: both `Required` and `Auto` must succeed.
        assert!(GpuRuntime::global_or_fail(GpuMode::Required).is_ok());
        assert!(GpuRuntime::global_or_fail(GpuMode::Auto).is_ok());
    }

    /// PIRLS-loop admission needs a runtime, enough work, enough columns and
    /// a known family.
    #[test]
    fn pirls_loop_admission_requires_runtime_size_and_known_family() {
        use crate::policy::{PirlsLoopAdmission, PirlsLoopCurvatureKind, PirlsLoopFamilyKind};
        let dispatch = GpuDispatchPolicy::default();
        let admits = |request: PirlsLoopAdmission| dispatch.should_use_gpu_pirls_loop(request);
        let base = PirlsLoopAdmission {
            n: 80_000,
            p: 44,
            family: Some(PirlsLoopFamilyKind::BernoulliLogit),
            curvature: PirlsLoopCurvatureKind::Fisher,
            gpu_available: true,
        };
        assert!(admits(base));
        // Without a runtime nothing is dispatched.
        assert!(!admits(PirlsLoopAdmission {
            gpu_available: false,
            ..base
        }));
        // Under the dense-work floor.
        assert!(!admits(PirlsLoopAdmission { n: 1_000, ..base }));
        // Small n with large p is admitted: 2*n*p^2 clears the work floor.
        assert!(admits(PirlsLoopAdmission {
            n: 2_000,
            p: 2_048,
            ..base
        }));
        // Under the column floor.
        assert!(!admits(PirlsLoopAdmission { p: 8, ..base }));
        // A custom family (outside the 6 JIT-cached ones) is declined.
        assert!(!admits(PirlsLoopAdmission {
            family: None,
            ..base
        }));
    }

    /// A refused `force` decision names both the kernel and the policy.
    #[test]
    fn force_policy_reports_unsupported_kernel() {
        let refused = GpuDecision {
            policy: GpuPolicy::Force,
            kernel: GpuKernel::DenseXtWX,
            use_gpu: false,
            reason: "gpu-force-unsupported",
        };
        let err = refused.require_supported().unwrap_err();
        for needle in ["dense-xtwx", "gpu=force"] {
            assert!(err.contains(needle));
        }
    }
}
gam_gpu/mod.rs

gam_gpu/
mod.rs