1#[macro_use]
18pub mod gpu_error;
19pub mod backend_probe;
20pub mod blas;
21#[cfg(target_os = "linux")]
22pub mod calibration;
23pub mod cpu_traits;
24pub mod device;
25pub mod device_cache;
26pub mod driver;
27pub mod device_runtime;
28pub mod encode_throughput;
29pub mod linalg_dispatch;
30pub mod memory;
31pub mod numerics_device;
32pub mod numerics_host;
33pub mod policy;
34pub mod pool;
35pub mod profile;
36pub mod solver;
37
38pub mod kernels;
40
41pub use cpu_traits::MatrixLocation;
42pub use device::GpuDeviceInfo;
43pub use device_runtime::GpuRuntime;
44pub use gpu_error::GpuError;
45pub use memory::{DeviceBuffer, DeviceCsrMatrix, DeviceMatrix, DeviceVector};
46pub use policy::{GpuDispatchPolicy, GpuMixedPrecisionPolicy};
47pub use pool::{balanced_partition, scatter_batched};
48pub use profile::{GpuExecutionTelemetry, KernelStat, KernelStatsSnapshot};
49
50use serde::{Deserialize, Serialize};
63use std::fmt;
64use std::sync::OnceLock;
65
/// Coarse readiness of the CUDA backend as seen by this process.
///
/// Produced by `cuda_backend_status`, which maps the presence of the global
/// `GpuRuntime` handle onto these two states.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CudaBackendStatus {
    /// No global GPU runtime handle is available.
    CudaUnavailable,
    /// A global GPU runtime handle exists.
    CudaReady,
}
71
72#[inline]
73pub(crate) fn cuda_backend_status() -> CudaBackendStatus {
74 if device_runtime::GpuRuntime::global().is_some() {
75 CudaBackendStatus::CudaReady
76 } else {
77 CudaBackendStatus::CudaUnavailable
78 }
79}
80
81#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Serialize, Deserialize)]
83#[serde(rename_all = "kebab-case")]
84pub enum GpuPolicy {
85 #[default]
87 Auto,
88 Off,
90 Force,
92}
93
94impl GpuPolicy {
95 pub fn parse(raw: &str) -> Option<Self> {
96 match raw.trim().to_ascii_lowercase().as_str() {
97 "auto" => Some(Self::Auto),
98 "off" => Some(Self::Off),
99 "force" => Some(Self::Force),
100 _ => None,
101 }
102 }
103
104 #[inline]
105 pub const fn as_str(self) -> &'static str {
106 match self {
107 Self::Auto => "auto",
108 Self::Off => "off",
109 Self::Force => "force",
110 }
111 }
112
113 #[inline]
115 pub const fn is_force(self) -> bool {
116 matches!(self, Self::Force)
117 }
118}
119
120impl fmt::Display for GpuPolicy {
121 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
122 f.write_str(self.as_str())
123 }
124}
125
126#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Serialize, Deserialize)]
139#[serde(rename_all = "kebab-case")]
140pub enum GpuMode {
141 #[default]
143 Auto,
144 Required,
146 Off,
148}
149
150impl GpuMode {
151 #[inline]
153 pub const fn as_str(self) -> &'static str {
154 match self {
155 Self::Auto => "auto",
156 Self::Required => "required",
157 Self::Off => "off",
158 }
159 }
160}
161
162impl fmt::Display for GpuMode {
163 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
164 f.write_str(self.as_str())
165 }
166}
167
168static GPU_MODE: OnceLock<GpuMode> = OnceLock::new();
169
170pub fn set_gpu_mode(mode: GpuMode) {
173 GPU_MODE.set(mode).ok();
174}
175
176#[inline]
180pub fn gpu_mode() -> GpuMode {
181 match GPU_MODE.get() {
182 Some(m) => *m,
183 None => GpuMode::Auto,
184 }
185}
186
/// Kernel identifiers understood by the GPU dispatch layer.
///
/// Each variant has a stable kebab-case label available through
/// `GpuKernel::as_str`; those labels are what appears in log lines and in
/// `gpu=force` error messages.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GpuKernel {
    DenseMatvec,
    DenseTransposeMatvec,
    DenseXtWX,
    CandidateScreen,
    DenseSolve,
    MatrixFreePcg,
    SparseAssembly,
    SpatialKernelOperator,
    MarginalSlopeRows,
    RemlTrace,
    FinalInference,
}
201
202impl GpuKernel {
203 pub const fn as_str(self) -> &'static str {
204 match self {
205 Self::DenseMatvec => "dense-matvec",
206 Self::DenseTransposeMatvec => "dense-transpose-matvec",
207 Self::DenseXtWX => "dense-xtwx",
208 Self::CandidateScreen => "candidate-screen",
209 Self::DenseSolve => "dense-solve",
210 Self::MatrixFreePcg => "matrix-free-pcg",
211 Self::SparseAssembly => "sparse-assembly",
212 Self::SpatialKernelOperator => "spatial-kernel-operator",
213 Self::MarginalSlopeRows => "marginal-slope-rows",
214 Self::RemlTrace => "reml-trace",
215 Self::FinalInference => "final-inference",
216 }
217 }
218}
219
220#[derive(Clone, Debug)]
222pub struct GpuDecision {
223 pub policy: GpuPolicy,
224 pub kernel: GpuKernel,
225 pub use_gpu: bool,
226 pub reason: &'static str,
227}
228
229static POLICY: OnceLock<GpuPolicy> = OnceLock::new();
230
231#[inline]
232pub fn global_policy() -> GpuPolicy {
233 match POLICY.get() {
240 Some(p) => *p,
241 None => GpuPolicy::Auto,
242 }
243}
244
245pub fn configure_global_policy(policy: GpuPolicy) {
252 POLICY.set(policy).ok();
254}
255
256#[inline]
263pub fn cuda_selected() -> bool {
264 match global_policy() {
265 GpuPolicy::Auto => device_runtime::GpuRuntime::is_available(),
266 GpuPolicy::Off => false,
267 GpuPolicy::Force => true,
268 }
269}
270
/// Caller-side assessment of whether a workload can go to the GPU.
///
/// Consumed by `decide` together with the global policy and runtime state.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GpuEligibility {
    /// The caller reports that the kernel has no GPU support available.
    BackendNotCompiled,
    /// GPU support exists but the workload is too small; `decide` keeps this
    /// on the CPU under `Auto` and sends it to the GPU under `Force`.
    WorkloadBelowThreshold,
    /// GPU support exists and the workload is large enough.
    Eligible,
}
289
290impl GpuEligibility {
291 #[inline]
295 pub const fn from_flags(supported: bool, large_enough: bool) -> Self {
296 if !supported {
297 Self::BackendNotCompiled
298 } else if !large_enough {
299 Self::WorkloadBelowThreshold
300 } else {
301 Self::Eligible
302 }
303 }
304}
305
306pub fn decide(kernel: GpuKernel, eligibility: GpuEligibility) -> GpuDecision {
310 let policy = global_policy();
311 let runtime_available = device_runtime::GpuRuntime::is_available();
317 let (use_gpu, reason) = match (policy, eligibility) {
318 (GpuPolicy::Off, _) => (false, "cpu-gpu-policy-off"),
319 (GpuPolicy::Auto, GpuEligibility::BackendNotCompiled) => {
320 (false, "cpu-gpu-backend-not-compiled")
321 }
322 (GpuPolicy::Auto, _) if !runtime_available => (false, "cpu-gpu-runtime-unavailable"),
323 (GpuPolicy::Auto, GpuEligibility::WorkloadBelowThreshold) => {
324 (false, "cpu-workload-below-gpu-threshold")
325 }
326 (GpuPolicy::Auto, GpuEligibility::Eligible) => (true, "gpu-auto-supported"),
327 (GpuPolicy::Force, GpuEligibility::BackendNotCompiled) => {
328 (false, "cpu-gpu-force-unsupported")
329 }
330 (GpuPolicy::Force, _) if !runtime_available => (false, "cpu-gpu-force-runtime-unavailable"),
331 (GpuPolicy::Force, GpuEligibility::WorkloadBelowThreshold)
334 | (GpuPolicy::Force, GpuEligibility::Eligible) => (true, "gpu-force-supported"),
335 };
336 GpuDecision {
337 policy,
338 kernel,
339 use_gpu,
340 reason,
341 }
342}
343
344impl GpuDecision {
345 pub fn require_supported(&self) -> Result<(), String> {
346 if self.policy == GpuPolicy::Force && !self.use_gpu {
347 return Err(format!(
348 "gpu=force requested kernel '{}' but no supported device backend is available ({})",
349 self.kernel.as_str(),
350 self.reason
351 ));
352 }
353 Ok(())
354 }
355
356 pub fn log(self) {
357 log::debug!(
358 "[GPU backend] kernel={} policy={} selected={} reason={}",
359 self.kernel.as_str(),
360 self.policy.as_str(),
361 self.use_gpu,
362 self.reason
363 );
364 }
365}
366
367pub fn log_backend_inventory_once() {
371 static LOGGED: OnceLock<()> = OnceLock::new();
372 LOGGED.get_or_init(|| {
373 let compiled_backends = if cfg!(target_os = "linux") {
374 "cuda-dynamic"
375 } else {
376 "none"
377 };
378 log::debug!(
379 "[GPU backend] policy={} compiled_backends={} kernels=dense-matvec,dense-transpose-matvec,dense-xtwx,candidate-screen,dense-solve,matrix-free-pcg,sparse-assembly,spatial-kernel-operator,marginal-slope-rows,reml-trace,final-inference",
380 global_policy().as_str(),
381 compiled_backends
382 );
383 });
384}
385
386#[inline]
387pub fn try_fast_ab(
388 a: ndarray::ArrayView2<'_, f64>,
389 b: ndarray::ArrayView2<'_, f64>,
390) -> Option<ndarray::Array2<f64>> {
391 linalg_dispatch::try_fast_ab(a, b)
392}
393#[inline]
394pub fn try_fast_atb_on_ordinal(
395 ordinal: usize,
396 a: ndarray::ArrayView2<'_, f64>,
397 b: ndarray::ArrayView2<'_, f64>,
398) -> Option<ndarray::Array2<f64>> {
399 linalg_dispatch::try_fast_atb_on_ordinal(ordinal, a, b)
400}
401#[inline]
402pub fn try_fast_av(
403 a: ndarray::ArrayView2<'_, f64>,
404 v: ndarray::ArrayView1<'_, f64>,
405) -> Option<ndarray::Array1<f64>> {
406 linalg_dispatch::try_fast_av(a, v)
407}
408#[inline]
409pub fn try_fast_atv(
410 a: ndarray::ArrayView2<'_, f64>,
411 v: ndarray::ArrayView1<'_, f64>,
412) -> Option<ndarray::Array1<f64>> {
413 linalg_dispatch::try_fast_atv(a, v)
414}
415#[inline]
416pub fn try_fast_ab_broadcast_b_batched(
417 a: ndarray::ArrayView3<'_, f64>,
418 b: ndarray::ArrayView2<'_, f64>,
419) -> Option<ndarray::Array3<f64>> {
420 linalg_dispatch::try_fast_ab_broadcast_b_batched(a, b)
421}
422#[inline]
423pub fn try_fast_abt_strided_batched(
424 a: ndarray::ArrayView3<'_, f64>,
425 b: ndarray::ArrayView3<'_, f64>,
426) -> Option<ndarray::Array3<f64>> {
427 linalg_dispatch::try_fast_abt_strided_batched(a, b)
428}
429#[inline]
430pub fn try_cholesky_lower_inplace(a: &mut ndarray::Array2<f64>) -> Option<()> {
431 linalg_dispatch::try_cholesky_lower_inplace(a)
432}
433#[inline]
434pub fn try_cholesky_batched_lower_inplace(matrices: &mut [ndarray::Array2<f64>]) -> Option<()> {
435 linalg_dispatch::try_cholesky_batched_lower_inplace(matrices)
436}
437#[inline]
438pub fn try_solve_lower_triangular_matrix(
439 lower: ndarray::ArrayView2<'_, f64>,
440 rhs: ndarray::ArrayView2<'_, f64>,
441) -> Option<ndarray::Array2<f64>> {
442 linalg_dispatch::try_solve_lower_triangular_matrix(lower, rhs)
443}
444#[inline]
445pub fn try_solve_upper_triangular_matrix(
446 upper: ndarray::ArrayView2<'_, f64>,
447 rhs: ndarray::ArrayView2<'_, f64>,
448) -> Option<ndarray::Array2<f64>> {
449 linalg_dispatch::try_solve_upper_triangular_matrix(upper, rhs)
450}
#[cfg(test)]
mod policy_tests {
    use super::*;

    /// Only the three canonical spellings parse; everything else is rejected.
    #[test]
    fn parses_canonical_user_gpu_policy_values() {
        let accepted = [
            ("auto", GpuPolicy::Auto),
            ("off", GpuPolicy::Off),
            ("force", GpuPolicy::Force),
        ];
        for (raw, expected) in accepted {
            assert_eq!(GpuPolicy::parse(raw), Some(expected));
        }

        for raw in ["cpu", "", "wat"] {
            assert_eq!(GpuPolicy::parse(raw), None);
        }
    }

    /// The default execution path is the CPU, and only device paths report
    /// `used_device()`.
    #[test]
    fn execution_path_defaults_to_cpu() {
        use gam_problem::ExecutionPath;

        let default_path = ExecutionPath::default();
        assert_eq!(default_path, ExecutionPath::Cpu);

        assert!(!ExecutionPath::Cpu.used_device());
        assert!(ExecutionPath::GpuResidentFull.used_device());
    }

    /// `global_or_fail` must never silently fall back: `Off` always errors,
    /// and `Required`/`Auto` succeed only when a device is actually present.
    #[test]
    fn gpu_mode_required_fails_closed_when_device_absent() {
        use crate::device_runtime::GpuRuntime;

        let off = GpuRuntime::global_or_fail(GpuMode::Off);
        assert!(matches!(
            off,
            Err(GpuError::DriverLibraryUnavailable { .. })
        ));

        if !GpuRuntime::is_available() {
            // No device: both modes must surface an error.
            let required = GpuRuntime::global_or_fail(GpuMode::Required);
            assert!(
                matches!(required, Err(GpuError::DriverLibraryUnavailable { .. })),
                "GpuMode::Required must fail closed when the device is absent, got {required:?}"
            );
            assert!(GpuRuntime::global_or_fail(GpuMode::Auto).is_err());
            return;
        }

        // Device present: both modes must hand back the runtime.
        assert!(GpuRuntime::global_or_fail(GpuMode::Required).is_ok());
        assert!(GpuRuntime::global_or_fail(GpuMode::Auto).is_ok());
    }

    /// Starting from an admitted baseline, each single change below is
    /// expected to flip (or keep) the admission decision as asserted.
    #[test]
    fn pirls_loop_admission_requires_runtime_size_and_known_family() {
        use crate::policy::{PirlsLoopAdmission, PirlsLoopCurvatureKind, PirlsLoopFamilyKind};

        let policy = GpuDispatchPolicy::default();
        let baseline = PirlsLoopAdmission {
            n: 80_000,
            p: 44,
            family: Some(PirlsLoopFamilyKind::BernoulliLogit),
            curvature: PirlsLoopCurvatureKind::Fisher,
            gpu_available: true,
        };
        assert!(policy.should_use_gpu_pirls_loop(baseline));

        // GPU reported unavailable.
        let no_gpu = PirlsLoopAdmission {
            gpu_available: false,
            ..baseline
        };
        assert!(!policy.should_use_gpu_pirls_loop(no_gpu));

        // Far fewer rows at the baseline width.
        let few_rows = PirlsLoopAdmission {
            n: 1_000,
            ..baseline
        };
        assert!(!policy.should_use_gpu_pirls_loop(few_rows));

        // Few rows but many columns is still admitted.
        let few_rows_wide = PirlsLoopAdmission {
            n: 2_000,
            p: 2_048,
            ..baseline
        };
        assert!(policy.should_use_gpu_pirls_loop(few_rows_wide));

        // Very narrow problem at the baseline row count.
        let narrow = PirlsLoopAdmission { p: 8, ..baseline };
        assert!(!policy.should_use_gpu_pirls_loop(narrow));

        // Family not recognised.
        let unknown_family = PirlsLoopAdmission {
            family: None,
            ..baseline
        };
        assert!(!policy.should_use_gpu_pirls_loop(unknown_family));
    }

    /// A forced decision that did not select the GPU must produce an error
    /// naming both the kernel and the `gpu=force` request.
    #[test]
    fn force_policy_reports_unsupported_kernel() {
        let decision = GpuDecision {
            policy: GpuPolicy::Force,
            kernel: GpuKernel::DenseXtWX,
            use_gpu: false,
            reason: "gpu-force-unsupported",
        };

        let message = decision.require_supported().unwrap_err();
        assert!(message.contains("dense-xtwx"));
        assert!(message.contains("gpu=force"));
    }
}