Skip to main content

gam_gpu/
mod.rs

1// GPU acceleration support.
2//
3// Infrastructure modules live at this level and are intentionally callable
4// from CPU-only builds: all public entry points are available without CUDA,
5// and the runtime reports an unavailable backend instead of changing
6// numerical results. CUDA-specific code is compiled only for Linux builds that
7// enable the `cuda` feature, so cudarc is never loaded by default CPU-only
8// builds.
9
10// `gpu_error` is declared first so its `#[macro_use]` macros (`gpu_err!`,
11// `gpu_bail!`) are in textual scope for every module below — `backend_probe`
12// in particular calls `gpu_err!` unqualified. Referring to these
13// `#[macro_export]` macros by absolute path (`crate::gpu_err`) is rejected
14// here: `lib.rs` pulls this module tree in via `include!`, which makes every
15// exported macro "macro-expanded", and absolute-path access to those is a
16// denied future-incompat lint.
17#[macro_use]
18pub mod gpu_error;
19pub mod backend_probe;
20pub mod blas;
21#[cfg(target_os = "linux")]
22pub mod calibration;
23pub mod cpu_traits;
24pub mod device;
25pub mod device_cache;
26pub mod device_runtime;
27pub mod dictionary_score;
28pub mod driver;
29pub mod encode_throughput;
30pub mod linalg_dispatch;
31pub mod memory;
32pub mod numerics_device;
33pub mod numerics_host;
34pub mod policy;
35pub mod pool;
36pub mod profile;
37pub mod solver;
38
39// Domain-specific GPU kernels are isolated from the infrastructure modules.
40pub mod kernels;
41
42pub use cpu_traits::MatrixLocation;
43pub use device::GpuDeviceInfo;
44pub use device_runtime::GpuRuntime;
45pub use dictionary_score::{
46    DEFAULT_DICTIONARY_SCORE_MIN_ELEMS, DEFAULT_DICTIONARY_SCORE_TILE_ELEMS,
47    DictionaryScoreRoutePlan,
48};
49pub use gpu_error::GpuError;
50pub use memory::{DeviceBuffer, DeviceCsrMatrix, DeviceMatrix, DeviceVector};
51pub use policy::{GpuDispatchPolicy, GpuMixedPrecisionPolicy};
52pub use pool::{balanced_partition, scatter_batched};
53pub use profile::{GpuExecutionTelemetry, KernelStat, KernelStatsSnapshot};
54
55// ---------------------------------------------------------------------------
56// User-facing policy and instrumentation hooks (formerly src/gpu.rs).
57//
58// The first production-safe step for acceleration is an explicit policy
59// layer: `Auto` may opportunistically use supported device-resident kernels,
60// `Off` guarantees the CPU path, and `Force` turns an unsupported GPU route
61// into a hard error instead of a silent CPU fallback. The numerical kernels
// are wired to call these helpers before selecting a backend; until a vendor
// backend is compiled in, this module intentionally reports "unsupported" so
// `force` fails loudly while `auto` remains a correct CPU fallback.
65// ---------------------------------------------------------------------------
66
67use serde::{Deserialize, Serialize};
68use std::fmt;
69use std::sync::OnceLock;
70
/// Two-state summary of whether a CUDA runtime is usable in this process.
///
/// Produced by `cuda_backend_status`, which maps the presence of the global
/// `GpuRuntime` onto these variants.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CudaBackendStatus {
    /// `GpuRuntime::global()` yielded no runtime.
    CudaUnavailable,
    /// `GpuRuntime::global()` yielded a runtime.
    CudaReady,
}
76
77#[inline]
78pub(crate) fn cuda_backend_status() -> CudaBackendStatus {
79    if device_runtime::GpuRuntime::global().is_some() {
80        CudaBackendStatus::CudaReady
81    } else {
82        CudaBackendStatus::CudaUnavailable
83    }
84}
85
86/// User-facing GPU backend policy.
87#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Serialize, Deserialize)]
88#[serde(rename_all = "kebab-case")]
89pub enum GpuPolicy {
90    /// Let the solver use GPU kernels only for supported, large-enough paths.
91    #[default]
92    Auto,
93    /// Always use CPU kernels.
94    Off,
95    /// Require GPU kernels and error if the requested path is unsupported.
96    Force,
97}
98
99impl GpuPolicy {
100    pub fn parse(raw: &str) -> Option<Self> {
101        match raw.trim().to_ascii_lowercase().as_str() {
102            "auto" => Some(Self::Auto),
103            "off" => Some(Self::Off),
104            "force" => Some(Self::Force),
105            _ => None,
106        }
107    }
108
109    #[inline]
110    pub const fn as_str(self) -> &'static str {
111        match self {
112            Self::Auto => "auto",
113            Self::Off => "off",
114            Self::Force => "force",
115        }
116    }
117
118    /// Whether unsupported GPU dispatch should be surfaced as a hard error.
119    #[inline]
120    pub const fn is_force(self) -> bool {
121        matches!(self, Self::Force)
122    }
123}
124
125impl fmt::Display for GpuPolicy {
126    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
127        f.write_str(self.as_str())
128    }
129}
130
131/// Fail-closed GPU residency mode (issue #1017).
132///
133/// Distinct from [`GpuPolicy`], which governs opportunistic per-kernel dispatch.
134/// `GpuMode` is the process-wide *residency contract* the resident solver
135/// consults through [`crate::device_runtime::GpuRuntime::global_or_fail`]:
136///
137/// * [`GpuMode::Auto`] — use the device when the probe admits it, fall back to
138///   CPU otherwise (the current, working behavior; preserved bit-for-bit).
139/// * [`GpuMode::Required`] — the device MUST be available; if the runtime is
140///   absent the resident path returns a structured error instead of silently
141///   running on the CPU. This is the fail-closed guard the reviewers asked for.
142/// * [`GpuMode::Off`] — never use the device.
143#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Serialize, Deserialize)]
144#[serde(rename_all = "kebab-case")]
145pub enum GpuMode {
146    /// Use the device when available; fall back to CPU otherwise.
147    #[default]
148    Auto,
149    /// Require the device; error (do not fall back) when it is unavailable.
150    Required,
151    /// Never use the device.
152    Off,
153}
154
155impl GpuMode {
156    /// Stable lowercase identifier.
157    #[inline]
158    pub const fn as_str(self) -> &'static str {
159        match self {
160            Self::Auto => "auto",
161            Self::Required => "required",
162            Self::Off => "off",
163        }
164    }
165}
166
167impl fmt::Display for GpuMode {
168    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
169        f.write_str(self.as_str())
170    }
171}
172
173static GPU_MODE: OnceLock<GpuMode> = OnceLock::new();
174
175/// Configure the process-wide GPU residency mode. First-writer-wins so
176/// concurrent fits cannot race the contract; a redundant late call is ignored.
177pub fn set_gpu_mode(mode: GpuMode) {
178    GPU_MODE.set(mode).ok();
179}
180
181/// Read the process-wide GPU residency mode. Defaults to [`GpuMode::Auto`]
182/// without claiming the slot, mirroring [`global_policy`] so an incidental
183/// read never locks the mode against a later explicit [`set_gpu_mode`].
184#[inline]
185pub fn gpu_mode() -> GpuMode {
186    match GPU_MODE.get() {
187        Some(m) => *m,
188        None => GpuMode::Auto,
189    }
190}
191
/// Identifiers for the hot kernels that backend selection reasons about.
///
/// Each variant has a stable kebab-case label, available through `as_str`,
/// that is used in log lines and error messages.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GpuKernel {
    DenseMatvec,
    DenseTransposeMatvec,
    DenseXtWX,
    CandidateScreen,
    DenseSolve,
    MatrixFreePcg,
    SparseAssembly,
    SpatialKernelOperator,
    MarginalSlopeRows,
    RemlTrace,
    FinalInference,
}
206
207impl GpuKernel {
208    pub const fn as_str(self) -> &'static str {
209        match self {
210            Self::DenseMatvec => "dense-matvec",
211            Self::DenseTransposeMatvec => "dense-transpose-matvec",
212            Self::DenseXtWX => "dense-xtwx",
213            Self::CandidateScreen => "candidate-screen",
214            Self::DenseSolve => "dense-solve",
215            Self::MatrixFreePcg => "matrix-free-pcg",
216            Self::SparseAssembly => "sparse-assembly",
217            Self::SpatialKernelOperator => "spatial-kernel-operator",
218            Self::MarginalSlopeRows => "marginal-slope-rows",
219            Self::RemlTrace => "reml-trace",
220            Self::FinalInference => "final-inference",
221        }
222    }
223}
224
225/// A backend-selection decision for a single hot kernel.
226#[derive(Clone, Debug)]
227pub struct GpuDecision {
228    pub policy: GpuPolicy,
229    pub kernel: GpuKernel,
230    pub use_gpu: bool,
231    pub reason: &'static str,
232}
233
234static POLICY: OnceLock<GpuPolicy> = OnceLock::new();
235
236#[inline]
237pub fn global_policy() -> GpuPolicy {
238    // Reading the policy must NOT claim the OnceLock slot: returning the
239    // default `Auto` via `get_or_init` would race against an explicit
240    // `configure_global_policy(...)` made later in the same process and
241    // silently lock the policy to `Auto`.  Keep the slot uninitialized
242    // until explicitly configured so first-writer-wins applies only to
243    // genuine writes, not to incidental reads from probe/dispatch code.
244    match POLICY.get() {
245        Some(p) => *p,
246        None => GpuPolicy::Auto,
247    }
248}
249
250/// Configure the process-wide policy before solver kernels are selected.
251/// If a previous explicit configuration already set the policy, the first
252/// value wins so concurrent fits cannot race policy changes.  Reads of
253/// `global_policy()` never claim the slot, so the very first explicit
254/// configuration always sticks even if dispatch code observed the
255/// default `Auto` beforehand.
256pub fn configure_global_policy(policy: GpuPolicy) {
257    // First-writer-wins semantics; ignore a redundant late call.
258    POLICY.set(policy).ok();
259}
260
261/// True when direct solver GPU entry points should be attempted.
262///
263/// `Auto` attempts CUDA only after the runtime probe finds a usable device.
264/// `Off` pins the process to CPU. `Force` attempts the GPU path so missing
265/// runtime/backend support becomes an explicit error at the callee instead of
266/// an implicit CPU route.
267#[inline]
268pub fn cuda_selected() -> bool {
269    match global_policy() {
270        GpuPolicy::Auto => device_runtime::GpuRuntime::is_available(),
271        GpuPolicy::Off => false,
272        GpuPolicy::Force => true,
273    }
274}
275
/// Call-site eligibility of a GPU kernel, expressed as a single joint state.
///
/// A caller builds exactly one variant, capturing both whether the backend is
/// compiled in and whether the workload clears the runtime threshold. This
/// replaces the earlier `(supported: bool, large_enough: bool)` pair, whose two
/// flags could be swapped silently at a call site: every meaningful state now
/// has exactly one constructor, and the `match` inside [`decide`] is total.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GpuEligibility {
    /// This build does not include a vendor backend for the kernel.
    BackendNotCompiled,
    /// The backend is compiled in, but the workload (n, m, ...) falls below
    /// the kernel's runtime threshold.
    WorkloadBelowThreshold,
    /// The backend is compiled in and the workload is large enough; only the
    /// policy and the runtime probe remain as gates.
    Eligible,
}
294
295impl GpuEligibility {
296    /// Combine the compile-time backend flag with the workload predicate into
297    /// the canonical joint state.  Use this only when you genuinely have two
298    /// independent booleans; otherwise prefer constructing a variant directly.
299    #[inline]
300    pub const fn from_flags(supported: bool, large_enough: bool) -> Self {
301        if !supported {
302            Self::BackendNotCompiled
303        } else if !large_enough {
304            Self::WorkloadBelowThreshold
305        } else {
306            Self::Eligible
307        }
308    }
309}
310
311/// Decide whether a GPU kernel may run. This is deliberately conservative:
312/// with no compiled vendor backend, `auto` returns CPU fallback and `force`
313/// returns an error at the call site through [`GpuDecision::require_supported`].
314pub fn decide(kernel: GpuKernel, eligibility: GpuEligibility) -> GpuDecision {
315    let policy = global_policy();
316    // Auto must consult the actual probed runtime, not only the
317    // compile-time eligibility.  Without this, `decide()` would claim
318    // GPU when the kernel is "compiled in" even though `GpuRuntime::global()`
319    // observed no device — silently producing CPU work via failed dispatch
320    // and hiding the cpu_reason from callers wanting to log fallback cause.
321    let runtime_available = device_runtime::GpuRuntime::is_available();
322    let (use_gpu, reason) = match (policy, eligibility) {
323        (GpuPolicy::Off, _) => (false, "cpu-gpu-policy-off"),
324        (GpuPolicy::Auto, GpuEligibility::BackendNotCompiled) => {
325            (false, "cpu-gpu-backend-not-compiled")
326        }
327        (GpuPolicy::Auto, _) if !runtime_available => (false, "cpu-gpu-runtime-unavailable"),
328        (GpuPolicy::Auto, GpuEligibility::WorkloadBelowThreshold) => {
329            (false, "cpu-workload-below-gpu-threshold")
330        }
331        (GpuPolicy::Auto, GpuEligibility::Eligible) => (true, "gpu-auto-supported"),
332        (GpuPolicy::Force, GpuEligibility::BackendNotCompiled) => {
333            (false, "cpu-gpu-force-unsupported")
334        }
335        (GpuPolicy::Force, _) if !runtime_available => (false, "cpu-gpu-force-runtime-unavailable"),
336        // Under `force`, the workload-threshold gate is intentionally bypassed:
337        // the user explicitly asked for GPU regardless of size.
338        (GpuPolicy::Force, GpuEligibility::WorkloadBelowThreshold)
339        | (GpuPolicy::Force, GpuEligibility::Eligible) => (true, "gpu-force-supported"),
340    };
341    GpuDecision {
342        policy,
343        kernel,
344        use_gpu,
345        reason,
346    }
347}
348
349impl GpuDecision {
350    pub fn require_supported(&self) -> Result<(), String> {
351        if self.policy == GpuPolicy::Force && !self.use_gpu {
352            return Err(format!(
353                "gpu=force requested kernel '{}' but no supported device backend is available ({})",
354                self.kernel.as_str(),
355                self.reason
356            ));
357        }
358        Ok(())
359    }
360
361    pub fn log(self) {
362        log::debug!(
363            "[GPU backend] kernel={} policy={} selected={} reason={}",
364            self.kernel.as_str(),
365            self.policy.as_str(),
366            self.use_gpu,
367            self.reason
368        );
369    }
370}
371
372/// Emit the roadmap-visible kernels at startup/debug time without affecting
373/// numerical execution. This keeps backend coverage auditable as real device
374/// kernels are added incrementally.
375pub fn log_backend_inventory_once() {
376    static LOGGED: OnceLock<()> = OnceLock::new();
377    LOGGED.get_or_init(|| {
378        let compiled_backends = if cfg!(target_os = "linux") {
379            "cuda-dynamic"
380        } else {
381            "none"
382        };
383        log::debug!(
384            "[GPU backend] policy={} compiled_backends={} kernels=dense-matvec,dense-transpose-matvec,dense-xtwx,candidate-screen,dense-solve,matrix-free-pcg,sparse-assembly,spatial-kernel-operator,marginal-slope-rows,reml-trace,final-inference",
385            global_policy().as_str(),
386            compiled_backends
387        );
388    });
389}
390
/// Thin forwarder to `linalg_dispatch::try_fast_ab`: both matrix views are
/// passed through unchanged and its `Option` result is returned as-is.
///
/// NOTE(review): judging by the name this is an accelerated `A * B` product
/// that yields `None` when no fast path applies — confirm against
/// `linalg_dispatch`.
#[inline]
pub fn try_fast_ab(
    a: ndarray::ArrayView2<'_, f64>,
    b: ndarray::ArrayView2<'_, f64>,
) -> Option<ndarray::Array2<f64>> {
    linalg_dispatch::try_fast_ab(a, b)
}
/// Thin forwarder to `linalg_dispatch::try_fast_atb_on_ordinal`: `ordinal` and
/// both matrix views are passed through unchanged and its `Option` result is
/// returned as-is.
///
/// NOTE(review): presumably an accelerated `Aᵀ * B` product pinned to the
/// device with the given ordinal, yielding `None` when no fast path applies —
/// confirm against `linalg_dispatch`.
#[inline]
pub fn try_fast_atb_on_ordinal(
    ordinal: usize,
    a: ndarray::ArrayView2<'_, f64>,
    b: ndarray::ArrayView2<'_, f64>,
) -> Option<ndarray::Array2<f64>> {
    linalg_dispatch::try_fast_atb_on_ordinal(ordinal, a, b)
}
/// Thin forwarder to `linalg_dispatch::try_fast_av`: the matrix and vector
/// views are passed through unchanged and its `Option` result is returned
/// as-is.
///
/// NOTE(review): presumably an accelerated matrix-vector product `A * v` that
/// yields `None` when no fast path applies — confirm against
/// `linalg_dispatch`.
#[inline]
pub fn try_fast_av(
    a: ndarray::ArrayView2<'_, f64>,
    v: ndarray::ArrayView1<'_, f64>,
) -> Option<ndarray::Array1<f64>> {
    linalg_dispatch::try_fast_av(a, v)
}
/// Thin forwarder to `linalg_dispatch::try_fast_atv`: the matrix and vector
/// views are passed through unchanged and its `Option` result is returned
/// as-is.
///
/// NOTE(review): presumably an accelerated transposed matrix-vector product
/// `Aᵀ * v` that yields `None` when no fast path applies — confirm against
/// `linalg_dispatch`.
#[inline]
pub fn try_fast_atv(
    a: ndarray::ArrayView2<'_, f64>,
    v: ndarray::ArrayView1<'_, f64>,
) -> Option<ndarray::Array1<f64>> {
    linalg_dispatch::try_fast_atv(a, v)
}
/// Thin forwarder to `linalg_dispatch::try_fast_ab_broadcast_b_batched`: the
/// 3-D view `a` and the 2-D view `b` are passed through unchanged and its
/// `Option` result is returned as-is.
///
/// NOTE(review): presumably a batched `A[i] * B` product in which the single
/// `b` is shared across the batch, yielding `None` when no fast path applies —
/// confirm the batch axis and semantics against `linalg_dispatch`.
#[inline]
pub fn try_fast_ab_broadcast_b_batched(
    a: ndarray::ArrayView3<'_, f64>,
    b: ndarray::ArrayView2<'_, f64>,
) -> Option<ndarray::Array3<f64>> {
    linalg_dispatch::try_fast_ab_broadcast_b_batched(a, b)
}
/// Thin forwarder to `linalg_dispatch::try_fast_abt_strided_batched`: both 3-D
/// views are passed through unchanged and its `Option` result is returned
/// as-is.
///
/// NOTE(review): presumably a strided-batched `A[i] * B[i]ᵀ` product that
/// yields `None` when no fast path applies — confirm the batch axis and
/// semantics against `linalg_dispatch`.
#[inline]
pub fn try_fast_abt_strided_batched(
    a: ndarray::ArrayView3<'_, f64>,
    b: ndarray::ArrayView3<'_, f64>,
) -> Option<ndarray::Array3<f64>> {
    linalg_dispatch::try_fast_abt_strided_batched(a, b)
}
/// Thin forwarder to `linalg_dispatch::try_cholesky_lower_inplace`: the mutable
/// matrix is passed through and its `Option<()>` result is returned as-is.
///
/// NOTE(review): presumably overwrites `a` with its lower Cholesky factor on
/// `Some(())`; what `a` contains after `None` is not visible here — confirm
/// against `linalg_dispatch`.
#[inline]
pub fn try_cholesky_lower_inplace(a: &mut ndarray::Array2<f64>) -> Option<()> {
    linalg_dispatch::try_cholesky_lower_inplace(a)
}
/// Thin forwarder to `linalg_dispatch::try_cholesky_batched_lower_inplace`: the
/// mutable slice of matrices is passed through and its `Option<()>` result is
/// returned as-is.
///
/// NOTE(review): presumably factors every matrix in place on `Some(())`; the
/// state of `matrices` after `None` is not visible here — confirm against
/// `linalg_dispatch`.
#[inline]
pub fn try_cholesky_batched_lower_inplace(matrices: &mut [ndarray::Array2<f64>]) -> Option<()> {
    linalg_dispatch::try_cholesky_batched_lower_inplace(matrices)
}
/// Thin forwarder to `linalg_dispatch::try_solve_lower_triangular_matrix`: both
/// views are passed through unchanged and its `Option` result is returned
/// as-is.
///
/// NOTE(review): presumably solves `lower * X = rhs` for `X` with `lower`
/// treated as lower-triangular, yielding `None` when no fast path applies —
/// confirm against `linalg_dispatch`.
#[inline]
pub fn try_solve_lower_triangular_matrix(
    lower: ndarray::ArrayView2<'_, f64>,
    rhs: ndarray::ArrayView2<'_, f64>,
) -> Option<ndarray::Array2<f64>> {
    linalg_dispatch::try_solve_lower_triangular_matrix(lower, rhs)
}
/// Thin forwarder to `linalg_dispatch::try_solve_upper_triangular_matrix`: both
/// views are passed through unchanged and its `Option` result is returned
/// as-is.
///
/// NOTE(review): presumably solves `upper * X = rhs` for `X` with `upper`
/// treated as upper-triangular, yielding `None` when no fast path applies —
/// confirm against `linalg_dispatch`.
#[inline]
pub fn try_solve_upper_triangular_matrix(
    upper: ndarray::ArrayView2<'_, f64>,
    rhs: ndarray::ArrayView2<'_, f64>,
) -> Option<ndarray::Array2<f64>> {
    linalg_dispatch::try_solve_upper_triangular_matrix(upper, rhs)
}
/// Unit tests for the user-facing policy layer: policy parsing, eligibility
/// folding, the fail-closed residency mode, PIRLS-loop admission, and the
/// `force` error path.
#[cfg(test)]
mod policy_tests {
    use super::*;

    /// Only the three canonical names parse; everything else is rejected.
    #[test]
    fn parses_canonical_user_gpu_policy_values() {
        assert_eq!(GpuPolicy::parse("auto"), Some(GpuPolicy::Auto));
        assert_eq!(GpuPolicy::parse("off"), Some(GpuPolicy::Off));
        assert_eq!(GpuPolicy::parse("force"), Some(GpuPolicy::Force));
        assert_eq!(GpuPolicy::parse("cpu"), None);
        assert_eq!(GpuPolicy::parse(""), None);
        assert_eq!(GpuPolicy::parse("wat"), None);
    }

    /// Parsing tolerates surrounding whitespace and ASCII case, and every
    /// policy round-trips through its own label and `Display` output.
    #[test]
    fn parse_ignores_case_and_whitespace_and_round_trips_labels() {
        assert_eq!(GpuPolicy::parse("  FORCE\n"), Some(GpuPolicy::Force));
        assert_eq!(GpuPolicy::parse("Auto"), Some(GpuPolicy::Auto));
        for policy in [GpuPolicy::Auto, GpuPolicy::Off, GpuPolicy::Force] {
            assert_eq!(GpuPolicy::parse(policy.as_str()), Some(policy));
            assert_eq!(policy.to_string(), policy.as_str());
        }
        assert!(GpuPolicy::Force.is_force());
        assert!(!GpuPolicy::Auto.is_force());
        assert!(!GpuPolicy::Off.is_force());
    }

    /// A missing backend dominates the workload flag; otherwise the workload
    /// flag picks between below-threshold and eligible.
    #[test]
    fn eligibility_from_flags_prioritizes_missing_backend() {
        assert_eq!(
            GpuEligibility::from_flags(false, false),
            GpuEligibility::BackendNotCompiled
        );
        assert_eq!(
            GpuEligibility::from_flags(false, true),
            GpuEligibility::BackendNotCompiled
        );
        assert_eq!(
            GpuEligibility::from_flags(true, false),
            GpuEligibility::WorkloadBelowThreshold
        );
        assert_eq!(
            GpuEligibility::from_flags(true, true),
            GpuEligibility::Eligible
        );
    }

    /// Whatever policy is in effect, a kernel whose backend is not compiled in
    /// is never routed to the GPU, and only `force` turns that into an error.
    #[test]
    fn missing_backend_never_selects_gpu() {
        let decision = decide(GpuKernel::DenseMatvec, GpuEligibility::BackendNotCompiled);
        assert!(!decision.use_gpu);
        assert_eq!(
            decision.require_supported().is_err(),
            decision.policy.is_force()
        );
    }

    #[test]
    fn execution_path_defaults_to_cpu() {
        use gam_problem::ExecutionPath;
        // The truthful execution-path classifier must default to the CPU path,
        // so a result struct that is never told otherwise cannot claim the
        // device (the original `used_device: bool` defaulted the same way, but
        // now the "no device" state is a named, non-lying variant).
        assert_eq!(ExecutionPath::default(), ExecutionPath::Cpu);
        assert!(!ExecutionPath::Cpu.used_device());
        assert!(ExecutionPath::GpuResidentFull.used_device());
    }

    #[test]
    fn gpu_mode_required_fails_closed_when_device_absent() {
        use crate::device_runtime::GpuRuntime;
        // Off always refuses, regardless of hardware.
        assert!(matches!(
            GpuRuntime::global_or_fail(GpuMode::Off),
            Err(GpuError::DriverLibraryUnavailable { .. })
        ));

        if GpuRuntime::is_available() {
            // On a GPU host both Auto and Required must succeed.
            assert!(GpuRuntime::global_or_fail(GpuMode::Required).is_ok());
            assert!(GpuRuntime::global_or_fail(GpuMode::Auto).is_ok());
        } else {
            // Fail-closed: Required surfaces a STRUCTURED error rather than a
            // silent CPU fallback. Auto also reports unavailable (callers there
            // swallow it and fall back), but the variant is what lets Required
            // propagate it as fatal.
            let required = GpuRuntime::global_or_fail(GpuMode::Required);
            assert!(
                matches!(required, Err(GpuError::DriverLibraryUnavailable { .. })),
                "GpuMode::Required must fail closed when the device is absent, got {required:?}"
            );
            assert!(GpuRuntime::global_or_fail(GpuMode::Auto).is_err());
        }
    }

    #[test]
    fn pirls_loop_admission_requires_runtime_size_and_known_family() {
        use crate::policy::{PirlsLoopAdmission, PirlsLoopCurvatureKind, PirlsLoopFamilyKind};
        let pol = GpuDispatchPolicy::default();
        let base = PirlsLoopAdmission {
            n: 80_000,
            p: 44,
            family: Some(PirlsLoopFamilyKind::BernoulliLogit),
            curvature: PirlsLoopCurvatureKind::Fisher,
            gpu_available: true,
        };
        assert!(pol.should_use_gpu_pirls_loop(base));
        // No runtime → never dispatch.
        assert!(!pol.should_use_gpu_pirls_loop(PirlsLoopAdmission {
            gpu_available: false,
            ..base
        }));
        // Below dense-work floor.
        assert!(!pol.should_use_gpu_pirls_loop(PirlsLoopAdmission { n: 1_000, ..base }));
        // Small n with large p is admitted because 2*n*p^2 clears the work floor.
        assert!(pol.should_use_gpu_pirls_loop(PirlsLoopAdmission {
            n: 2_000,
            p: 2_048,
            ..base
        }));
        // Below column floor.
        assert!(!pol.should_use_gpu_pirls_loop(PirlsLoopAdmission { p: 8, ..base }));
        // Custom family (not in 6 JIT-cached set) declines.
        assert!(!pol.should_use_gpu_pirls_loop(PirlsLoopAdmission {
            family: None,
            ..base
        }));
    }

    /// An unsatisfied `force` decision produces an error that names the
    /// kernel, the policy, and the recorded reason.
    #[test]
    fn force_policy_reports_unsupported_kernel() {
        // Use the reason tag `decide` actually emits for this state so the
        // fixture mirrors a real decision.
        let decision = GpuDecision {
            policy: GpuPolicy::Force,
            kernel: GpuKernel::DenseXtWX,
            use_gpu: false,
            reason: "cpu-gpu-force-unsupported",
        };
        let err = decision.require_supported().unwrap_err();
        assert!(err.contains("dense-xtwx"));
        assert!(err.contains("gpu=force"));
        assert!(err.contains("cpu-gpu-force-unsupported"));
    }

    /// Outside of an unsatisfied `force`, `require_supported` is a no-op.
    #[test]
    fn non_force_or_satisfied_decisions_are_accepted() {
        let cpu_fallback = GpuDecision {
            policy: GpuPolicy::Auto,
            kernel: GpuKernel::DenseXtWX,
            use_gpu: false,
            reason: "cpu-gpu-backend-not-compiled",
        };
        assert!(cpu_fallback.require_supported().is_ok());

        let forced_gpu = GpuDecision {
            policy: GpuPolicy::Force,
            kernel: GpuKernel::DenseXtWX,
            use_gpu: true,
            reason: "gpu-force-supported",
        };
        assert!(forced_gpu.require_supported().is_ok());
    }
}