gam_gpu/
device.rs

/// Feature flags inferred from a device's compute capability (`major.minor`).
///
/// As populated by [`GpuCapability::from_compute_capability`], every flag is a
/// pure function of the major version; nothing in this type queries the driver.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct GpuCapability {
    /// Major component of the compute capability (the `8` in `8.6`).
    pub compute_major: i32,
    /// Minor component of the compute capability (the `6` in `8.6`).
    pub compute_minor: i32,
    /// Set by the constructor when `major >= 7`.
    pub has_tensor_cores: bool,
    /// Set by the constructor when `major >= 8`.
    ///
    /// NOTE(review): this is keyed on the major version only, so 8.6 / 8.9
    /// parts get the same flag as 8.0 — confirm that is the intended meaning
    /// before using it as a throughput signal.
    pub has_fp64_tensor_cores: bool,
    /// Set by the constructor when `major >= 8`.
    pub has_async_copy: bool,
    /// Set by the constructor when `major >= 9`.
    pub has_cluster_launch: bool,
    /// Set by the constructor when `major >= 9`.
    pub has_tma: bool,
    /// Warp width in threads; the constructor always records `32`.
    pub min_warp_size: i32,
}

impl GpuCapability {
    /// Builds the capability record for compute capability `major.minor`.
    ///
    /// Only `major` decides the feature flags; `minor` is stored verbatim and
    /// is consulted later by [`GpuCapability::nvrtc_arch`].
    pub const fn from_compute_capability(major: i32, minor: i32) -> Self {
        // Generation gates, named after the first architecture of each major.
        let volta_or_newer = major >= 7;
        let ampere_or_newer = major >= 8;
        let hopper_or_newer = major >= 9;

        Self {
            compute_major: major,
            compute_minor: minor,
            has_tensor_cores: volta_or_newer,
            has_fp64_tensor_cores: ampere_or_newer,
            has_async_copy: ampere_or_newer,
            has_cluster_launch: hopper_or_newer,
            has_tma: hopper_or_newer,
            min_warp_size: 32,
        }
    }

    /// Virtual architecture to hand to NVRTC's `--gpu-architecture` for this
    /// capability, e.g. `compute_80` for an A100 (`8.0`).
    ///
    /// This is a correctness requirement rather than a tuning knob. When no
    /// `--gpu-architecture` is given, NVRTC targets a default virtual arch
    /// older than `sm_60`, which lacks the `atomicAdd(double*, double)`
    /// overload introduced with compute capability 6.0. Kernel sources that
    /// use `double` atomics (the SAE arrow/Schur PCG kernels) then fail to
    /// compile, the module load returns `Err`, and the whole device path
    /// silently falls back to the CPU. Deriving the arch from the real device
    /// capability lets those kernels build.
    ///
    /// Rounding policy: a capability without an exact entry is rounded DOWN
    /// (never up) — majors of 9 and above to `compute_90`, other 8.x / 7.x
    /// minors to `compute_80` / `compute_70`, and everything else to
    /// `compute_60`. Rounding up could name an arch the installed toolkit does
    /// not know, which would itself be a compile failure.
    ///
    /// The return type is `&'static str` because `cudarc`'s
    /// `CompileOptions::arch` is an `Option<&'static str>`.
    #[must_use]
    pub const fn nvrtc_arch(&self) -> &'static str {
        let minor = self.compute_minor;
        match self.compute_major {
            // 9.x and any newer major: clamp to compute_90 so the installed
            // NVRTC still recognises the arch; refine once the toolkit/cudarc
            // learn the newer targets.
            9..=i32::MAX => "compute_90",
            8 => match minor {
                9 => "compute_89",
                6 => "compute_86",
                _ => "compute_80",
            },
            7 => {
                if minor == 5 {
                    "compute_75"
                } else {
                    "compute_70"
                }
            }
            // 6.x and everything below: use the lowest arch that still offers
            // the `double` atomicAdd so a 6.x-era toolkit accepts the source.
            // In practice gam needs CC >= 6.0 for the double-atomic kernels.
            _ => "compute_60",
        }
    }
}
61
62#[derive(Clone, Debug, Eq, PartialEq)]
63pub struct GpuDeviceInfo {
64    pub ordinal: usize,
65    pub name: String,
66    pub capability: GpuCapability,
67    pub sm_count: i32,
68    pub max_threads_per_sm: i32,
69    pub max_shared_mem_per_block: usize,
70    pub l2_cache_bytes: usize,
71    pub total_mem_bytes: usize,
72    pub free_mem_bytes: usize,
73    pub ecc_enabled: bool,
74    pub integrated: bool,
75    pub mig_mode: bool,
76}
77
78impl GpuDeviceInfo {
79    /// Fraction of a device's *total* VRAM that any single dispatch is allowed
80    /// to budget against. The per-device budget is `min(free, total ·
81    /// MEMORY_BUDGET_TOTAL_FRACTION)`: free memory is the hard ceiling, but we
82    /// cap at half of *total* so that even on a freshly idle device we leave
83    /// headroom for the driver context, cuBLAS/cuSOLVER workspaces, and a
84    /// second concurrent tile from the multi-GPU pool. Denominator `2` ⇒ half.
85    const MEMORY_BUDGET_TOTAL_DIVISOR: usize = 2;
86
87    /// Per-device byte budget a dispatch may size its buffers against:
88    /// `min(free_mem, total_mem / MEMORY_BUDGET_TOTAL_DIVISOR)`. Single source
89    /// of truth for both the primary-device budget (`device_runtime::probe`) and the
90    /// per-ordinal pool budget (`GpuRuntime::memory_budget_for`).
91    #[must_use]
92    pub const fn memory_budget_bytes(&self) -> usize {
93        let half_total = self.total_mem_bytes / Self::MEMORY_BUDGET_TOTAL_DIVISOR;
94        if self.free_mem_bytes < half_total {
95            self.free_mem_bytes
96        } else {
97            half_total
98        }
99    }
100
101    pub fn score(&self) -> f64 {
102        let fp64_bonus = if self.capability.has_fp64_tensor_cores {
103            100.0
104        } else {
105            0.0
106        };
107        let async_bonus = if self.capability.has_async_copy {
108            50.0
109        } else {
110            0.0
111        };
112        f64::from(self.sm_count)
113            + (self.free_mem_bytes as f64 / 1_073_741_824.0) * 4.0
114            + fp64_bonus
115            + async_bonus
116    }
117}
118
#[cfg(test)]
mod nvrtc_arch_tests {
    use super::GpuCapability;

    /// NVRTC virtual arch chosen for compute capability `major.minor`.
    fn arch_for(major: i32, minor: i32) -> &'static str {
        let capability = GpuCapability::from_compute_capability(major, minor);
        capability.nvrtc_arch()
    }

    /// Extracts the numeric `NN` from a `compute_NN` virtual-arch string,
    /// panicking when the string does not have that shape.
    fn arch_number(arch: &str) -> i32 {
        match arch.strip_prefix("compute_").map(str::parse::<i32>) {
            Some(Ok(number)) => number,
            _ => panic!("nvrtc_arch must be a `compute_NN` string, got {arch}"),
        }
    }

    /// #1551 — pins the exact capability → virtual-arch table that the SAE
    /// double-atomic PCG kernels depend on. A wrong entry brings back the
    /// "GPU 0%" silent CPU fallback (NVRTC rejecting
    /// `atomicAdd(double*,double)`).
    #[test]
    fn nvrtc_arch_maps_known_capabilities() {
        let table = [
            ((9, 0), "compute_90"), // Hopper H100
            ((8, 9), "compute_89"), // Ada L4 / L40
            ((8, 6), "compute_86"), // Ampere A10G / 30xx
            ((8, 0), "compute_80"), // Ampere A100
            ((7, 5), "compute_75"), // Turing T4
            ((7, 0), "compute_70"), // Volta V100
            ((6, 0), "compute_60"), // Pascal P100
            // Minors without their own entry fall into the major's bucket.
            ((8, 7), "compute_80"), // Orin -> 8.x bucket
            ((7, 2), "compute_70"), // Xavier -> 7.x bucket
        ];
        for &((major, minor), expected) in table.iter() {
            assert_eq!(arch_for(major, minor), expected);
        }
    }

    /// #1551 CRITICAL INVARIANT: each capability gam supports (CC >= 6.0) has
    /// to resolve to a virtual arch of at least `compute_60`, the first arch
    /// offering `atomicAdd(double*, double)` (added in CC 6.0). Anything lower
    /// makes the SAE matvec kernels fail to NVRTC-compile, and the device path
    /// silently declines.
    #[test]
    fn nvrtc_arch_never_below_double_atomic_floor() {
        let supported = [
            (6, 0),
            (6, 1),
            (7, 0),
            (7, 5),
            (8, 0),
            (8, 6),
            (8, 9),
            (9, 0),
        ];
        for (major, minor) in supported.iter().copied() {
            let n = arch_number(arch_for(major, minor));
            assert!(
                n >= 60,
                "CC {major}.{minor} mapped to compute_{n}, below the double-atomicAdd \
                 floor compute_60 — the SAE device PCG would silently fall back to CPU"
            );
        }
    }

    /// Majors newer than any known entry must clamp DOWN to a toolkit-valid
    /// arch (`compute_90`) rather than up to one the installed NVRTC cannot
    /// target, which would itself fail to compile and decline the device path.
    #[test]
    fn nvrtc_arch_future_capabilities_round_down_to_known() {
        for &(major, minor) in [(10, 0), (12, 3)].iter() {
            assert_eq!(arch_for(major, minor), "compute_90");
        }
        // The clamped arch is still at or above the double-atomic floor.
        let clamped = arch_number(arch_for(10, 0));
        assert!(clamped >= 60);
    }

    /// Capabilities below 6.0 (or otherwise unknown-low) pin to `compute_60`,
    /// the lowest arch that still provides the double atomicAdd, so a CC6-era
    /// toolkit accepts the kernel source instead of declining.
    #[test]
    fn nvrtc_arch_below_floor_pins_to_compute_60() {
        for &(major, minor) in [(5, 0), (3, 5)].iter() {
            assert_eq!(arch_for(major, minor), "compute_60");
        }
    }
}
gam_gpu/device.rs

gam_gpu/
device.rs