Skip to main content

cuda_rust_wasm/runtime/
occupancy.rs

1//! Occupancy Calculator
2//!
3//! Predicts GPU occupancy (active warps / max warps) for a given kernel
4//! configuration, mirroring CUDA's `cudaOccupancyMaxActiveBlocksPerMultiprocessor`.
5//!
//! Occupancy is limited by four factors:
//! 1. Registers per thread
//! 2. Shared memory per block
//! 3. Threads per block (warp granularity)
//! 4. The hardware cap on resident blocks per SM
10
11use std::fmt;
12
/// GPU architecture specification.
///
/// Describes the per-SM (NVIDIA) or per-CU (AMD) resource limits that the
/// occupancy calculation needs. All fields are public, so custom
/// architectures can be described with a plain struct literal; the
/// associated functions below provide ready-made presets.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GpuArchSpec {
    /// Compute capability name (e.g., "sm_90", "gfx1100").
    pub name: String,
    /// Max threads per SM/CU.
    pub max_threads_per_sm: u32,
    /// Max blocks (thread groups) per SM.
    pub max_blocks_per_sm: u32,
    /// Max warps per SM.
    pub max_warps_per_sm: u32,
    /// Warp/wavefront size in threads (32 for NVIDIA, 64 for AMD CDNA;
    /// AMD RDNA runs natively in wave32). Must be non-zero.
    pub warp_size: u32,
    /// Total registers per SM.
    pub registers_per_sm: u32,
    /// Register allocation granularity (registers are allocated in chunks).
    pub register_alloc_granularity: u32,
    /// Shared memory per SM (bytes).
    pub shared_memory_per_sm: u32,
    /// Shared memory allocation granularity (bytes).
    pub shared_memory_alloc_granularity: u32,
    /// Number of SMs/CUs on the device.
    pub sm_count: u32,
}

impl GpuArchSpec {
    /// NVIDIA Hopper (SM 9.0) — H100.
    pub fn hopper() -> Self {
        Self {
            name: "sm_90".into(),
            max_threads_per_sm: 2048,
            max_blocks_per_sm: 32,
            max_warps_per_sm: 64,
            warp_size: 32,
            registers_per_sm: 65536,
            register_alloc_granularity: 256,
            shared_memory_per_sm: 228 * 1024,
            shared_memory_alloc_granularity: 256,
            sm_count: 132,
        }
    }

    /// NVIDIA Ada Lovelace (SM 8.9) — RTX 4090.
    pub fn ada_lovelace() -> Self {
        Self {
            name: "sm_89".into(),
            max_threads_per_sm: 1536,
            max_blocks_per_sm: 24,
            max_warps_per_sm: 48,
            warp_size: 32,
            registers_per_sm: 65536,
            register_alloc_granularity: 256,
            shared_memory_per_sm: 100 * 1024,
            shared_memory_alloc_granularity: 256,
            sm_count: 128,
        }
    }

    /// NVIDIA Ampere (SM 8.0) — A100.
    pub fn ampere() -> Self {
        Self {
            name: "sm_80".into(),
            max_threads_per_sm: 2048,
            max_blocks_per_sm: 32,
            max_warps_per_sm: 64,
            warp_size: 32,
            registers_per_sm: 65536,
            register_alloc_granularity: 256,
            shared_memory_per_sm: 164 * 1024,
            shared_memory_alloc_granularity: 128,
            sm_count: 108,
        }
    }

    /// AMD CDNA3 — MI300X.
    pub fn cdna3() -> Self {
        Self {
            name: "gfx942".into(),
            max_threads_per_sm: 2048,
            max_blocks_per_sm: 32,
            max_warps_per_sm: 32,
            warp_size: 64, // AMD wavefront
            registers_per_sm: 65536,
            register_alloc_granularity: 256,
            shared_memory_per_sm: 64 * 1024,
            shared_memory_alloc_granularity: 256,
            sm_count: 304,
        }
    }

    /// Generic spec for WebGPU/software simulation.
    pub fn generic() -> Self {
        Self {
            name: "generic".into(),
            max_threads_per_sm: 1024,
            max_blocks_per_sm: 16,
            max_warps_per_sm: 32,
            warp_size: 32,
            registers_per_sm: 32768,
            register_alloc_granularity: 256,
            shared_memory_per_sm: 48 * 1024,
            shared_memory_alloc_granularity: 256,
            sm_count: 1,
        }
    }
}
119
/// Kernel resource requirements.
///
/// Captures the per-block launch configuration and the per-thread register
/// footprint that determine how many blocks can be resident on one SM.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct KernelResources {
    /// Threads per block (block size).
    pub threads_per_block: u32,
    /// Registers used per thread.
    pub registers_per_thread: u32,
    /// Static shared memory per block (bytes).
    pub shared_memory_static: u32,
    /// Dynamic shared memory per block (bytes).
    pub shared_memory_dynamic: u32,
}

impl KernelResources {
    /// Create a kernel description with no dynamic shared memory.
    ///
    /// `shared_memory` is recorded as the static shared memory size in
    /// bytes; set `shared_memory_dynamic` afterwards if the kernel also
    /// requests dynamic shared memory at launch.
    pub fn new(threads_per_block: u32, registers_per_thread: u32, shared_memory: u32) -> Self {
        Self {
            threads_per_block,
            registers_per_thread,
            shared_memory_static: shared_memory,
            shared_memory_dynamic: 0,
        }
    }

    /// Total shared memory per block (static + dynamic), in bytes.
    ///
    /// The sum saturates at `u32::MAX` instead of overflowing, so an
    /// oversized request is reported as "too large" rather than panicking
    /// in debug builds or wrapping to a small value in release builds.
    pub fn total_shared_memory(&self) -> u32 {
        self.shared_memory_static
            .saturating_add(self.shared_memory_dynamic)
    }
}
149
/// Occupancy calculation result.
///
/// As filled in by `calculate_occupancy`, this carries the final occupancy
/// figure together with the individual per-resource block limits it was
/// derived from, so callers can see how close each resource is to being
/// the bottleneck.
#[derive(Debug, Clone)]
pub struct OccupancyResult {
    /// Active blocks per SM (the minimum of the four `blocks_limited_by_*` values).
    pub active_blocks_per_sm: u32,
    /// Active warps per SM (active blocks times warps per block).
    pub active_warps_per_sm: u32,
    /// Max warps per SM (hardware limit).
    pub max_warps_per_sm: u32,
    /// Occupancy as a fraction (0.0 to 1.0): active warps divided by max warps.
    pub occupancy: f64,
    /// Which resource is the bottleneck.
    pub limiting_factor: LimitingFactor,
    /// Blocks limited by thread count (warp slots on the SM).
    pub blocks_limited_by_threads: u32,
    /// Blocks limited by registers.
    pub blocks_limited_by_registers: u32,
    /// Blocks limited by shared memory.
    pub blocks_limited_by_smem: u32,
    /// Blocks limited by max-blocks-per-SM.
    pub blocks_limited_by_max_blocks: u32,
}
172
/// Resource that limits occupancy.
///
/// The `Debug` representation of each variant is used verbatim in the
/// human-readable summary of an occupancy result, so variant names are part
/// of the visible output.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum LimitingFactor {
    /// The SM runs out of warp slots for the block's threads.
    Threads,
    /// The SM's register file is exhausted.
    Registers,
    /// The SM's shared memory is exhausted.
    SharedMemory,
    /// The hardware cap on resident blocks per SM is reached.
    MaxBlocksPerSm,
}
181
182/// Calculate occupancy for a kernel on a given GPU architecture.
183pub fn calculate_occupancy(arch: &GpuArchSpec, kernel: &KernelResources) -> OccupancyResult {
184    let warp_size = arch.warp_size;
185
186    // Warps per block (round up)
187    let warps_per_block = (kernel.threads_per_block + warp_size - 1) / warp_size;
188
189    // 1. Thread limit
190    let blocks_by_threads = if warps_per_block > 0 {
191        arch.max_warps_per_sm / warps_per_block
192    } else {
193        0
194    };
195
196    // 2. Register limit
197    let regs_per_warp = kernel.registers_per_thread * warp_size;
198    let regs_per_warp_aligned = round_up(regs_per_warp, arch.register_alloc_granularity);
199    let regs_per_block = regs_per_warp_aligned * warps_per_block;
200    let blocks_by_registers = if regs_per_block > 0 {
201        arch.registers_per_sm / regs_per_block
202    } else {
203        arch.max_blocks_per_sm
204    };
205
206    // 3. Shared memory limit
207    let smem_per_block = kernel.total_shared_memory();
208    let smem_aligned = round_up(smem_per_block, arch.shared_memory_alloc_granularity);
209    let blocks_by_smem = if smem_aligned > 0 {
210        arch.shared_memory_per_sm / smem_aligned
211    } else {
212        arch.max_blocks_per_sm
213    };
214
215    // 4. Max blocks per SM limit
216    let blocks_by_max = arch.max_blocks_per_sm;
217
218    // Take the minimum
219    let active_blocks = blocks_by_threads
220        .min(blocks_by_registers)
221        .min(blocks_by_smem)
222        .min(blocks_by_max);
223
224    let active_warps = active_blocks * warps_per_block;
225    let occupancy = active_warps as f64 / arch.max_warps_per_sm as f64;
226
227    let limiting_factor = if active_blocks == blocks_by_threads {
228        LimitingFactor::Threads
229    } else if active_blocks == blocks_by_registers {
230        LimitingFactor::Registers
231    } else if active_blocks == blocks_by_smem {
232        LimitingFactor::SharedMemory
233    } else {
234        LimitingFactor::MaxBlocksPerSm
235    };
236
237    OccupancyResult {
238        active_blocks_per_sm: active_blocks,
239        active_warps_per_sm: active_warps,
240        max_warps_per_sm: arch.max_warps_per_sm,
241        occupancy,
242        limiting_factor,
243        blocks_limited_by_threads: blocks_by_threads,
244        blocks_limited_by_registers: blocks_by_registers,
245        blocks_limited_by_smem: blocks_by_smem,
246        blocks_limited_by_max_blocks: blocks_by_max,
247    }
248}
249
250/// Suggest optimal block size for maximum occupancy.
251pub fn suggest_block_size(arch: &GpuArchSpec, registers_per_thread: u32, shared_memory: u32) -> BlockSizeSuggestion {
252    let mut best_occupancy = 0.0;
253    let mut best_block_size = arch.warp_size;
254    let mut results = Vec::new();
255
256    // Try block sizes from 1 warp to max
257    let max_block = arch.max_threads_per_sm.min(1024);
258    let mut block_size = arch.warp_size;
259
260    while block_size <= max_block {
261        let kernel = KernelResources::new(block_size, registers_per_thread, shared_memory);
262        let result = calculate_occupancy(arch, &kernel);
263
264        results.push((block_size, result.occupancy));
265
266        if result.occupancy > best_occupancy {
267            best_occupancy = result.occupancy;
268            best_block_size = block_size;
269        }
270        block_size += arch.warp_size;
271    }
272
273    BlockSizeSuggestion {
274        optimal_block_size: best_block_size,
275        max_occupancy: best_occupancy,
276        all_results: results,
277    }
278}
279
/// Block size suggestion result.
#[derive(Debug, Clone, PartialEq)]
pub struct BlockSizeSuggestion {
    /// Block size (threads per block) with the highest occupancy found.
    pub optimal_block_size: u32,
    /// Occupancy achieved at `optimal_block_size`, as a fraction.
    pub max_occupancy: f64,
    /// Every evaluated candidate as `(block_size, occupancy)`, in the order
    /// the candidates were tried.
    pub all_results: Vec<(u32, f64)>,
}
287
/// Round `value` up to the next multiple of `granularity`.
///
/// A `granularity` of zero means "no alignment" and returns `value`
/// unchanged. If the next multiple does not fit in a `u32`, the result
/// saturates at `u32::MAX` instead of overflowing.
fn round_up(value: u32, granularity: u32) -> u32 {
    if granularity == 0 {
        return value;
    }
    // Work from the remainder so no intermediate sum exceeds the result.
    match value % granularity {
        0 => value,
        remainder => value.saturating_add(granularity - remainder),
    }
}
292
293impl fmt::Display for OccupancyResult {
294    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
295        write!(f, "Occupancy: {:.1}% ({}/{} warps, {} blocks/SM, limited by {:?})",
296            self.occupancy * 100.0,
297            self.active_warps_per_sm,
298            self.max_warps_per_sm,
299            self.active_blocks_per_sm,
300            self.limiting_factor)
301    }
302}
303
304// ── Tests ──────────────────────────────────────────────────────────
305
#[cfg(test)]
mod tests {
    use super::*;

    /// Compare two occupancy fractions; the values used here are exact
    /// binary fractions, so a machine-epsilon tolerance is sufficient.
    fn assert_close(actual: f64, expected: f64) {
        assert!(
            (actual - expected).abs() < f64::EPSILON,
            "expected {}, got {}",
            expected,
            actual
        );
    }

    #[test]
    fn test_occupancy_basic() {
        let arch = GpuArchSpec::ampere();
        let kernel = KernelResources::new(256, 32, 0);
        let result = calculate_occupancy(&arch, &kernel);

        // 8 warps per block; 64 warp slots → 8 blocks; registers also allow 8.
        assert_eq!(result.active_blocks_per_sm, 8);
        assert_eq!(result.active_warps_per_sm, 64);
        assert_eq!(result.max_warps_per_sm, 64);
        assert_close(result.occupancy, 1.0);
        // Threads and registers tie; threads are reported first.
        assert_eq!(result.limiting_factor, LimitingFactor::Threads);
    }

    #[test]
    fn test_occupancy_register_limited() {
        let arch = GpuArchSpec::ampere();
        // High register usage: 128 regs/thread, 256 threads
        let kernel = KernelResources::new(256, 128, 0);
        let result = calculate_occupancy(&arch, &kernel);

        // With 128 regs * 256 threads = 32768 regs per block
        // Ampere has 65536 regs/SM → 2 blocks
        assert_eq!(result.blocks_limited_by_registers, 2);
        assert_eq!(result.blocks_limited_by_threads, 8);
        assert_eq!(result.active_blocks_per_sm, 2);
        assert_eq!(result.active_warps_per_sm, 16);
        assert_close(result.occupancy, 0.25);
        assert_eq!(result.limiting_factor, LimitingFactor::Registers);
    }

    #[test]
    fn test_occupancy_smem_limited() {
        let arch = GpuArchSpec::ampere();
        // Large shared memory: 48KB per block
        let kernel = KernelResources::new(256, 32, 48 * 1024);
        let result = calculate_occupancy(&arch, &kernel);

        // Ampere: 164KB smem / 48KB per block → 3 blocks
        assert_eq!(result.blocks_limited_by_smem, 3);
        assert_eq!(result.active_blocks_per_sm, 3);
        assert_eq!(result.active_warps_per_sm, 24);
        assert_close(result.occupancy, 0.375);
        assert_eq!(result.limiting_factor, LimitingFactor::SharedMemory);
    }

    #[test]
    fn test_occupancy_full() {
        let arch = GpuArchSpec::ampere();
        // Small kernel: 2 warps per block, 32 blocks fill all 64 warp slots.
        let kernel = KernelResources::new(64, 16, 0);
        let result = calculate_occupancy(&arch, &kernel);

        assert_eq!(result.active_blocks_per_sm, 32);
        assert_eq!(result.active_warps_per_sm, 64);
        assert_eq!(result.blocks_limited_by_max_blocks, 32);
        assert_close(result.occupancy, 1.0);
    }

    #[test]
    fn test_occupancy_zero_threads() {
        let arch = GpuArchSpec::ampere();
        // An empty block occupies nothing.
        let kernel = KernelResources::new(0, 32, 0);
        let result = calculate_occupancy(&arch, &kernel);

        assert_eq!(result.active_blocks_per_sm, 0);
        assert_eq!(result.active_warps_per_sm, 0);
        assert_close(result.occupancy, 0.0);
        assert_eq!(result.limiting_factor, LimitingFactor::Threads);
    }

    #[test]
    fn test_suggest_block_size() {
        let arch = GpuArchSpec::ampere();
        let suggestion = suggest_block_size(&arch, 32, 0);

        // One warp per block is capped at 32 blocks (50%); two warps per
        // block is the smallest size that fills all 64 warp slots.
        assert_eq!(suggestion.optimal_block_size, 64);
        assert_close(suggestion.max_occupancy, 1.0);

        // Candidates are 32, 64, ..., 1024.
        assert_eq!(suggestion.all_results.len(), 32);
        assert_eq!(suggestion.all_results[0].0, 32);
        assert_close(suggestion.all_results[0].1, 0.5);
        assert_eq!(suggestion.all_results[31].0, 1024);
    }

    #[test]
    fn test_hopper_arch() {
        let arch = GpuArchSpec::hopper();
        let kernel = KernelResources::new(256, 32, 0);
        let result = calculate_occupancy(&arch, &kernel);
        // Hopper: 2048 threads, 64 warps → 256 threads = 8 warps per block
        // 64/8 = 8 blocks (thread-limited)
        assert_eq!(result.active_blocks_per_sm, 8);
        assert_close(result.occupancy, 1.0);
        assert_eq!(result.limiting_factor, LimitingFactor::Threads);
    }

    #[test]
    fn test_amd_cdna3() {
        let arch = GpuArchSpec::cdna3();
        let kernel = KernelResources::new(256, 32, 0);
        let result = calculate_occupancy(&arch, &kernel);
        // AMD warp size = 64 → 256/64 = 4 warps per block; 32/4 = 8 blocks
        assert_eq!(result.active_blocks_per_sm, 8);
        assert_eq!(result.active_warps_per_sm, 32);
        assert_close(result.occupancy, 1.0);
    }

    #[test]
    fn test_occupancy_display() {
        let result = OccupancyResult {
            active_blocks_per_sm: 8,
            active_warps_per_sm: 64,
            max_warps_per_sm: 64,
            occupancy: 1.0,
            limiting_factor: LimitingFactor::Threads,
            blocks_limited_by_threads: 8,
            blocks_limited_by_registers: 16,
            blocks_limited_by_smem: 32,
            blocks_limited_by_max_blocks: 32,
        };
        let s = format!("{}", result);
        assert_eq!(
            s,
            "Occupancy: 100.0% (64/64 warps, 8 blocks/SM, limited by Threads)"
        );
    }

    #[test]
    fn test_dynamic_shared_memory() {
        let mut kernel = KernelResources::new(256, 32, 1024);
        kernel.shared_memory_dynamic = 2048;
        assert_eq!(kernel.total_shared_memory(), 3072);

        let arch = GpuArchSpec::ampere();
        let result = calculate_occupancy(&arch, &kernel);
        // 164KB / 3KB → 54 blocks by shared memory; threads still cap at 8.
        assert_eq!(result.blocks_limited_by_smem, 54);
        assert_eq!(result.active_blocks_per_sm, 8);
        assert_close(result.occupancy, 1.0);
    }

    #[test]
    fn test_round_up() {
        assert_eq!(round_up(0, 128), 0);
        assert_eq!(round_up(100, 256), 256);
        assert_eq!(round_up(1024, 256), 1024);
        assert_eq!(round_up(1025, 256), 1280);
        // Zero granularity disables alignment.
        assert_eq!(round_up(5, 0), 5);
    }
}