gam_gpu/device.rs
/// Feature summary of a GPU, keyed off its compute capability `major.minor`.
///
/// `GpuCapability::from_compute_capability` fills in every flag from the
/// major version alone; the minor version is stored but does not influence
/// any flag there.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct GpuCapability {
    /// Major component of the compute capability (the `8` in `8.0`).
    pub compute_major: i32,
    /// Minor component of the compute capability (the `0` in `8.0`).
    pub compute_minor: i32,
    /// Set by `from_compute_capability` when `major >= 7`.
    pub has_tensor_cores: bool,
    /// Set by `from_compute_capability` when `major >= 8`.
    ///
    /// NOTE(review): keyed on the major version only, so minor revisions
    /// within a major are not distinguished — confirm that is intended.
    pub has_fp64_tensor_cores: bool,
    /// Set by `from_compute_capability` when `major >= 8`.
    pub has_async_copy: bool,
    /// Set by `from_compute_capability` when `major >= 9`.
    pub has_cluster_launch: bool,
    /// Set by `from_compute_capability` when `major >= 9`.
    pub has_tma: bool,
    /// `from_compute_capability` always sets this to `32`.
    pub min_warp_size: i32,
}
12
13impl GpuCapability {
14 pub const fn from_compute_capability(major: i32, minor: i32) -> Self {
15 Self {
16 compute_major: major,
17 compute_minor: minor,
18 has_tensor_cores: major >= 7,
19 has_fp64_tensor_cores: major >= 8,
20 has_async_copy: major >= 8,
21 has_cluster_launch: major >= 9,
22 has_tma: major >= 9,
23 min_warp_size: 32,
24 }
25 }
26
27 /// NVRTC `--gpu-architecture` virtual-arch string for this device's compute
28 /// capability (e.g. `compute_80` for an A100 `8.0`).
29 ///
30 /// Critical for NVRTC correctness, not just performance: with no
31 /// `--gpu-architecture`, NVRTC defaults to a virtual arch below `sm_60`,
32 /// where the `atomicAdd(double*, double)` overload (added in compute
33 /// capability 6.0) does not exist. A kernel source using `double` atomics
34 /// (the SAE arrow/Schur PCG kernels do) then fails to compile, the module
35 /// load Errs, and the whole device path silently falls back to the CPU.
36 /// Keying the arch to the real device capability admits those kernels.
37 ///
38 /// Returns a `&'static str` because `cudarc`'s `CompileOptions::arch` is
39 /// `Option<&'static str>`. Unknown/future capabilities round DOWN to the
40 /// nearest known major to stay valid for the installed NVRTC, never up
41 /// (an arch newer than the toolkit knows would itself fail to compile).
42 #[must_use]
43 pub const fn nvrtc_arch(&self) -> &'static str {
44 match (self.compute_major, self.compute_minor) {
45 // Newer-than-known majors round DOWN to compute_90 to stay valid for
46 // the installed NVRTC (an arch the toolkit doesn't know would itself
47 // fail to compile); refine when the toolkit/cudarc gain the arch.
48 (major, _) if major >= 9 => "compute_90",
49 (8, 9) => "compute_89",
50 (8, 6) => "compute_86",
51 (8, _) => "compute_80",
52 (7, 5) => "compute_75",
53 (7, _) => "compute_70",
54 // 6.x and anything below: pin the lowest arch that still has the
55 // `double` atomicAdd so a 6.x-built toolkit accepts the source. gam
56 // requires CC >= 6.0 in practice for the double-atomic kernels.
57 _ => "compute_60",
58 }
59 }
60}
61
/// Description of one GPU device: identity, capability flags, and resource
/// figures.
///
/// `memory_budget_bytes` and `score` are computed purely from these fields.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct GpuDeviceInfo {
    /// Device index. NOTE(review): presumably the driver's device ordinal —
    /// confirm against the code that populates this struct.
    pub ordinal: usize,
    /// Device name string.
    pub name: String,
    /// Capability flags for the device; `score` reads
    /// `has_fp64_tensor_cores` and `has_async_copy` from it.
    pub capability: GpuCapability,
    /// SM count; the base term of `score`.
    pub sm_count: i32,
    /// Not read by the methods in this file. NOTE(review): by name, the
    /// per-SM thread limit — confirm.
    pub max_threads_per_sm: i32,
    /// Not read by the methods in this file. NOTE(review): by name, the
    /// per-block shared-memory limit, presumably in bytes — confirm.
    pub max_shared_mem_per_block: usize,
    /// Not read by the methods in this file; L2 cache size in bytes per the
    /// field name.
    pub l2_cache_bytes: usize,
    /// Total device memory in bytes; `memory_budget_bytes` caps the budget at
    /// this value divided by `MEMORY_BUDGET_TOTAL_DIVISOR`.
    pub total_mem_bytes: usize,
    /// Free device memory in bytes; the hard ceiling in
    /// `memory_budget_bytes` and weighted per GiB in `score`.
    pub free_mem_bytes: usize,
    /// Not read by the methods in this file. NOTE(review): presumably whether
    /// ECC is enabled on the device — confirm.
    pub ecc_enabled: bool,
    /// Not read by the methods in this file. NOTE(review): presumably whether
    /// the GPU is an integrated one — confirm.
    pub integrated: bool,
    /// Not read by the methods in this file. NOTE(review): presumably whether
    /// the device is in MIG mode — confirm.
    pub mig_mode: bool,
}
77
78impl GpuDeviceInfo {
79 /// Fraction of a device's *total* VRAM that any single dispatch is allowed
80 /// to budget against. The per-device budget is `min(free, total ·
81 /// MEMORY_BUDGET_TOTAL_FRACTION)`: free memory is the hard ceiling, but we
82 /// cap at half of *total* so that even on a freshly idle device we leave
83 /// headroom for the driver context, cuBLAS/cuSOLVER workspaces, and a
84 /// second concurrent tile from the multi-GPU pool. Denominator `2` ⇒ half.
85 const MEMORY_BUDGET_TOTAL_DIVISOR: usize = 2;
86
87 /// Per-device byte budget a dispatch may size its buffers against:
88 /// `min(free_mem, total_mem / MEMORY_BUDGET_TOTAL_DIVISOR)`. Single source
89 /// of truth for both the primary-device budget (`device_runtime::probe`) and the
90 /// per-ordinal pool budget (`GpuRuntime::memory_budget_for`).
91 #[must_use]
92 pub const fn memory_budget_bytes(&self) -> usize {
93 let half_total = self.total_mem_bytes / Self::MEMORY_BUDGET_TOTAL_DIVISOR;
94 if self.free_mem_bytes < half_total {
95 self.free_mem_bytes
96 } else {
97 half_total
98 }
99 }
100
101 pub fn score(&self) -> f64 {
102 let fp64_bonus = if self.capability.has_fp64_tensor_cores {
103 100.0
104 } else {
105 0.0
106 };
107 let async_bonus = if self.capability.has_async_copy {
108 50.0
109 } else {
110 0.0
111 };
112 f64::from(self.sm_count)
113 + (self.free_mem_bytes as f64 / 1_073_741_824.0) * 4.0
114 + fp64_bonus
115 + async_bonus
116 }
117}