1#[macro_use]
18pub mod gpu_error;
19pub mod backend_probe;
20pub mod blas;
21#[cfg(target_os = "linux")]
22pub mod calibration;
23pub mod cpu_traits;
24pub mod device;
25pub mod device_cache;
26pub mod driver;
27pub mod device_runtime;
28pub mod linalg_dispatch;
29pub mod memory;
30pub mod numerics_device;
31pub mod numerics_host;
32pub mod policy;
33pub mod pool;
34pub mod profile;
35pub mod solver;
36
37pub mod kernels;
39
40pub use cpu_traits::{ExecutionTarget, MatrixLocation};
41pub use device::GpuDeviceInfo;
42pub use device_runtime::GpuRuntime;
43pub use gpu_error::GpuError;
44pub use memory::{DeviceBuffer, DeviceCsrMatrix, DeviceMatrix, DeviceVector};
45pub use policy::{GpuDispatchPolicy, GpuMixedPrecisionPolicy};
46pub use pool::{balanced_partition, scatter_batched};
47pub use profile::{GpuExecutionTelemetry, KernelStat, KernelStatsSnapshot};
48
49use serde::{Deserialize, Serialize};
62use std::fmt;
63use std::sync::OnceLock;
64
/// Readiness of the CUDA backend, as reported by `cuda_backend_status`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CudaBackendStatus {
    /// `GpuRuntime::global()` did not yield a runtime.
    CudaUnavailable,
    /// `GpuRuntime::global()` yielded a runtime.
    CudaReady,
}
70
71#[inline]
72pub(crate) fn cuda_backend_status() -> CudaBackendStatus {
73 if device_runtime::GpuRuntime::global().is_some() {
74 CudaBackendStatus::CudaReady
75 } else {
76 CudaBackendStatus::CudaUnavailable
77 }
78}
79
80#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Serialize, Deserialize)]
82#[serde(rename_all = "kebab-case")]
83pub enum GpuPolicy {
84 #[default]
86 Auto,
87 Off,
89 Force,
91}
92
93impl GpuPolicy {
94 pub fn parse(raw: &str) -> Option<Self> {
95 match raw.trim().to_ascii_lowercase().as_str() {
96 "auto" => Some(Self::Auto),
97 "off" => Some(Self::Off),
98 "force" => Some(Self::Force),
99 _ => None,
100 }
101 }
102
103 #[inline]
104 pub const fn as_str(self) -> &'static str {
105 match self {
106 Self::Auto => "auto",
107 Self::Off => "off",
108 Self::Force => "force",
109 }
110 }
111
112 #[inline]
114 pub const fn is_force(self) -> bool {
115 matches!(self, Self::Force)
116 }
117}
118
119impl fmt::Display for GpuPolicy {
120 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
121 f.write_str(self.as_str())
122 }
123}
124
125#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Serialize, Deserialize)]
138#[serde(rename_all = "kebab-case")]
139pub enum GpuMode {
140 #[default]
142 Auto,
143 Required,
145 Off,
147}
148
149impl GpuMode {
150 #[inline]
152 pub const fn as_str(self) -> &'static str {
153 match self {
154 Self::Auto => "auto",
155 Self::Required => "required",
156 Self::Off => "off",
157 }
158 }
159}
160
161impl fmt::Display for GpuMode {
162 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
163 f.write_str(self.as_str())
164 }
165}
166
167static GPU_MODE: OnceLock<GpuMode> = OnceLock::new();
168
169pub fn set_gpu_mode(mode: GpuMode) {
172 GPU_MODE.set(mode).ok();
173}
174
175#[inline]
179pub fn gpu_mode() -> GpuMode {
180 match GPU_MODE.get() {
181 Some(m) => *m,
182 None => GpuMode::Auto,
183 }
184}
185
/// Identifies a kernel for which a GPU dispatch decision is made.
///
/// Each variant has a stable kebab-case label, available through [`GpuKernel::as_str`],
/// that appears in log lines and error messages.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GpuKernel {
    DenseMatvec,
    DenseTransposeMatvec,
    DenseXtWX,
    CandidateScreen,
    DenseSolve,
    MatrixFreePcg,
    SparseAssembly,
    SpatialKernelOperator,
    MarginalSlopeRows,
    RemlTrace,
    FinalInference,
}

impl GpuKernel {
    /// Kebab-case label for this kernel.
    pub const fn as_str(self) -> &'static str {
        // Kept as an exhaustive match so a new variant cannot be added without a label.
        let label = match self {
            GpuKernel::DenseMatvec => "dense-matvec",
            GpuKernel::DenseTransposeMatvec => "dense-transpose-matvec",
            GpuKernel::DenseXtWX => "dense-xtwx",
            GpuKernel::CandidateScreen => "candidate-screen",
            GpuKernel::DenseSolve => "dense-solve",
            GpuKernel::MatrixFreePcg => "matrix-free-pcg",
            GpuKernel::SparseAssembly => "sparse-assembly",
            GpuKernel::SpatialKernelOperator => "spatial-kernel-operator",
            GpuKernel::MarginalSlopeRows => "marginal-slope-rows",
            GpuKernel::RemlTrace => "reml-trace",
            GpuKernel::FinalInference => "final-inference",
        };
        label
    }
}
218
219#[derive(Clone, Debug)]
221pub struct GpuDecision {
222 pub policy: GpuPolicy,
223 pub kernel: GpuKernel,
224 pub use_gpu: bool,
225 pub reason: &'static str,
226}
227
228static POLICY: OnceLock<GpuPolicy> = OnceLock::new();
229
230#[inline]
231pub fn global_policy() -> GpuPolicy {
232 match POLICY.get() {
239 Some(p) => *p,
240 None => GpuPolicy::Auto,
241 }
242}
243
244pub fn configure_global_policy(policy: GpuPolicy) {
251 POLICY.set(policy).ok();
253}
254
255#[inline]
262pub fn cuda_selected() -> bool {
263 match global_policy() {
264 GpuPolicy::Auto => device_runtime::GpuRuntime::is_available(),
265 GpuPolicy::Off => false,
266 GpuPolicy::Force => true,
267 }
268}
269
/// A call site's assessment of whether a workload may run on the GPU.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GpuEligibility {
    /// The call site reported the kernel as unsupported.
    BackendNotCompiled,
    /// The kernel is supported but the workload was reported as not large enough.
    WorkloadBelowThreshold,
    /// The kernel is supported and the workload is large enough.
    Eligible,
}

impl GpuEligibility {
    /// Builds an eligibility value from two flags.
    ///
    /// `supported` takes precedence: when it is `false` the result is
    /// [`GpuEligibility::BackendNotCompiled`] regardless of `large_enough`.
    #[inline]
    pub const fn from_flags(supported: bool, large_enough: bool) -> Self {
        match (supported, large_enough) {
            (false, _) => Self::BackendNotCompiled,
            (true, false) => Self::WorkloadBelowThreshold,
            (true, true) => Self::Eligible,
        }
    }
}
304
305pub fn decide(kernel: GpuKernel, eligibility: GpuEligibility) -> GpuDecision {
309 let policy = global_policy();
310 let runtime_available = device_runtime::GpuRuntime::is_available();
316 let (use_gpu, reason) = match (policy, eligibility) {
317 (GpuPolicy::Off, _) => (false, "cpu-gpu-policy-off"),
318 (GpuPolicy::Auto, GpuEligibility::BackendNotCompiled) => {
319 (false, "cpu-gpu-backend-not-compiled")
320 }
321 (GpuPolicy::Auto, _) if !runtime_available => (false, "cpu-gpu-runtime-unavailable"),
322 (GpuPolicy::Auto, GpuEligibility::WorkloadBelowThreshold) => {
323 (false, "cpu-workload-below-gpu-threshold")
324 }
325 (GpuPolicy::Auto, GpuEligibility::Eligible) => (true, "gpu-auto-supported"),
326 (GpuPolicy::Force, GpuEligibility::BackendNotCompiled) => {
327 (false, "cpu-gpu-force-unsupported")
328 }
329 (GpuPolicy::Force, _) if !runtime_available => (false, "cpu-gpu-force-runtime-unavailable"),
330 (GpuPolicy::Force, GpuEligibility::WorkloadBelowThreshold)
333 | (GpuPolicy::Force, GpuEligibility::Eligible) => (true, "gpu-force-supported"),
334 };
335 GpuDecision {
336 policy,
337 kernel,
338 use_gpu,
339 reason,
340 }
341}
342
343impl GpuDecision {
344 pub fn require_supported(&self) -> Result<(), String> {
345 if self.policy == GpuPolicy::Force && !self.use_gpu {
346 return Err(format!(
347 "gpu=force requested kernel '{}' but no supported device backend is available ({})",
348 self.kernel.as_str(),
349 self.reason
350 ));
351 }
352 Ok(())
353 }
354
355 pub fn log(self) {
356 log::debug!(
357 "[GPU backend] kernel={} policy={} selected={} reason={}",
358 self.kernel.as_str(),
359 self.policy.as_str(),
360 self.use_gpu,
361 self.reason
362 );
363 }
364}
365
366pub fn log_backend_inventory_once() {
370 static LOGGED: OnceLock<()> = OnceLock::new();
371 LOGGED.get_or_init(|| {
372 let compiled_backends = if cfg!(target_os = "linux") {
373 "cuda-dynamic"
374 } else {
375 "none"
376 };
377 log::debug!(
378 "[GPU backend] policy={} compiled_backends={} kernels=dense-matvec,dense-transpose-matvec,dense-xtwx,candidate-screen,dense-solve,matrix-free-pcg,sparse-assembly,spatial-kernel-operator,marginal-slope-rows,reml-trace,final-inference",
379 global_policy().as_str(),
380 compiled_backends
381 );
382 });
383}
384
385#[inline]
386pub fn try_fast_ab(
387 a: ndarray::ArrayView2<'_, f64>,
388 b: ndarray::ArrayView2<'_, f64>,
389) -> Option<ndarray::Array2<f64>> {
390 linalg_dispatch::try_fast_ab(a, b)
391}
392#[inline]
393pub fn try_fast_atb_on_ordinal(
394 ordinal: usize,
395 a: ndarray::ArrayView2<'_, f64>,
396 b: ndarray::ArrayView2<'_, f64>,
397) -> Option<ndarray::Array2<f64>> {
398 linalg_dispatch::try_fast_atb_on_ordinal(ordinal, a, b)
399}
400#[inline]
401pub fn try_fast_av(
402 a: ndarray::ArrayView2<'_, f64>,
403 v: ndarray::ArrayView1<'_, f64>,
404) -> Option<ndarray::Array1<f64>> {
405 linalg_dispatch::try_fast_av(a, v)
406}
407#[inline]
408pub fn try_fast_atv(
409 a: ndarray::ArrayView2<'_, f64>,
410 v: ndarray::ArrayView1<'_, f64>,
411) -> Option<ndarray::Array1<f64>> {
412 linalg_dispatch::try_fast_atv(a, v)
413}
414#[inline]
415pub fn try_fast_ab_broadcast_b_batched(
416 a: ndarray::ArrayView3<'_, f64>,
417 b: ndarray::ArrayView2<'_, f64>,
418) -> Option<ndarray::Array3<f64>> {
419 linalg_dispatch::try_fast_ab_broadcast_b_batched(a, b)
420}
421#[inline]
422pub fn try_fast_abt_strided_batched(
423 a: ndarray::ArrayView3<'_, f64>,
424 b: ndarray::ArrayView3<'_, f64>,
425) -> Option<ndarray::Array3<f64>> {
426 linalg_dispatch::try_fast_abt_strided_batched(a, b)
427}
428#[inline]
429pub fn try_cholesky_lower_inplace(a: &mut ndarray::Array2<f64>) -> Option<()> {
430 linalg_dispatch::try_cholesky_lower_inplace(a)
431}
432#[inline]
433pub fn try_cholesky_batched_lower_inplace(matrices: &mut [ndarray::Array2<f64>]) -> Option<()> {
434 linalg_dispatch::try_cholesky_batched_lower_inplace(matrices)
435}
436#[inline]
437pub fn try_solve_lower_triangular_matrix(
438 lower: ndarray::ArrayView2<'_, f64>,
439 rhs: ndarray::ArrayView2<'_, f64>,
440) -> Option<ndarray::Array2<f64>> {
441 linalg_dispatch::try_solve_lower_triangular_matrix(lower, rhs)
442}
443#[inline]
444pub fn try_solve_upper_triangular_matrix(
445 upper: ndarray::ArrayView2<'_, f64>,
446 rhs: ndarray::ArrayView2<'_, f64>,
447) -> Option<ndarray::Array2<f64>> {
448 linalg_dispatch::try_solve_upper_triangular_matrix(upper, rhs)
449}
#[cfg(test)]
mod policy_tests {
    use super::*;

    /// `GpuPolicy::parse` accepts the three canonical names and rejects everything else.
    #[test]
    fn parses_canonical_user_gpu_policy_values() {
        let expectations = [
            ("auto", Some(GpuPolicy::Auto)),
            ("off", Some(GpuPolicy::Off)),
            ("force", Some(GpuPolicy::Force)),
            ("cpu", None),
            ("", None),
            ("wat", None),
        ];
        for (raw, expected) in expectations {
            assert_eq!(GpuPolicy::parse(raw), expected);
        }
    }

    /// The default execution path is the CPU one, and only device paths report device use.
    #[test]
    fn execution_path_defaults_to_cpu() {
        use gam_problem::ExecutionPath;

        let default_path = ExecutionPath::default();
        assert_eq!(default_path, ExecutionPath::Cpu);
        assert!(!ExecutionPath::Cpu.used_device());
        assert!(ExecutionPath::GpuResidentFull.used_device());
    }

    /// `global_or_fail` must refuse `Off`, and must refuse `Required`/`Auto` when no
    /// device is available.
    #[test]
    fn gpu_mode_required_fails_closed_when_device_absent() {
        use crate::device_runtime::GpuRuntime;

        let off_attempt = GpuRuntime::global_or_fail(GpuMode::Off);
        assert!(matches!(
            off_attempt,
            Err(GpuError::DriverLibraryUnavailable { .. })
        ));

        if !GpuRuntime::is_available() {
            let required = GpuRuntime::global_or_fail(GpuMode::Required);
            assert!(
                matches!(required, Err(GpuError::DriverLibraryUnavailable { .. })),
                "GpuMode::Required must fail closed when the device is absent, got {required:?}"
            );
            assert!(GpuRuntime::global_or_fail(GpuMode::Auto).is_err());
            return;
        }

        assert!(GpuRuntime::global_or_fail(GpuMode::Required).is_ok());
        assert!(GpuRuntime::global_or_fail(GpuMode::Auto).is_ok());
    }

    /// Admission into the GPU PIRLS loop is checked against a baseline that is admitted
    /// and against single-field variations of it.
    #[test]
    fn pirls_loop_admission_requires_runtime_size_and_known_family() {
        use crate::policy::{PirlsLoopAdmission, PirlsLoopCurvatureKind, PirlsLoopFamilyKind};

        let pol = GpuDispatchPolicy::default();
        let admits = |case: PirlsLoopAdmission| pol.should_use_gpu_pirls_loop(case);

        let base = PirlsLoopAdmission {
            n: 80_000,
            p: 44,
            family: Some(PirlsLoopFamilyKind::BernoulliLogit),
            curvature: PirlsLoopCurvatureKind::Fisher,
            gpu_available: true,
        };
        assert!(admits(base));

        let without_gpu = PirlsLoopAdmission {
            gpu_available: false,
            ..base
        };
        assert!(!admits(without_gpu));

        let few_rows = PirlsLoopAdmission { n: 1_000, ..base };
        assert!(!admits(few_rows));

        let few_rows_many_columns = PirlsLoopAdmission {
            n: 2_000,
            p: 2_048,
            ..base
        };
        assert!(admits(few_rows_many_columns));

        let few_columns = PirlsLoopAdmission { p: 8, ..base };
        assert!(!admits(few_columns));

        let unknown_family = PirlsLoopAdmission {
            family: None,
            ..base
        };
        assert!(!admits(unknown_family));
    }

    /// A forced decision that did not select the GPU produces an error naming the kernel
    /// and the `gpu=force` request.
    #[test]
    fn force_policy_reports_unsupported_kernel() {
        let decision = GpuDecision {
            policy: GpuPolicy::Force,
            kernel: GpuKernel::DenseXtWX,
            use_gpu: false,
            reason: "gpu-force-unsupported",
        };
        let message = decision.require_supported().unwrap_err();
        assert!(message.contains("dense-xtwx"));
        assert!(message.contains("gpu=force"));
    }
}