baracuda_driver/occupancy.rs
1//! Occupancy calculators — how many blocks can a kernel fit per SM, and
2//! what block size maximizes utilization.
3//!
4//! These are essential for kernel tuning: before launching, ask the driver
5//! "given this kernel and this dynamic-shared-memory budget, what's the
6//! best grid/block shape?" The results depend on the target device's
7//! register file, shared-memory size, and the kernel's own resource use.
8
9use baracuda_cuda_sys::driver;
10
11use crate::error::{check, Result};
12use crate::module::Function;
13
14/// How many blocks of `block_size` threads (using `dynamic_smem_bytes` of
15/// dynamic shared memory per block) can run concurrently on each SM of the
16/// current device.
17pub fn max_active_blocks_per_multiprocessor(
18 func: &Function,
19 block_size: i32,
20 dynamic_smem_bytes: usize,
21) -> Result<i32> {
22 let d = driver()?;
23 let cu = d.cu_occupancy_max_active_blocks_per_multiprocessor()?;
24 let mut n: core::ffi::c_int = 0;
25 // SAFETY: `func.as_raw()` is a live kernel handle; `&mut n` is writable.
26 check(unsafe { cu(&mut n, func.as_raw(), block_size, dynamic_smem_bytes) })?;
27 Ok(n)
28}
29
30/// As above, but with an explicit flag bitmask (see
31/// `CU_OCCUPANCY_*` in NVIDIA's headers). Passing `0` matches the
32/// no-flags version.
33pub fn max_active_blocks_per_multiprocessor_with_flags(
34 func: &Function,
35 block_size: i32,
36 dynamic_smem_bytes: usize,
37 flags: u32,
38) -> Result<i32> {
39 let d = driver()?;
40 let cu = d.cu_occupancy_max_active_blocks_per_multiprocessor_with_flags()?;
41 let mut n: core::ffi::c_int = 0;
42 check(unsafe { cu(&mut n, func.as_raw(), block_size, dynamic_smem_bytes, flags) })?;
43 Ok(n)
44}
45
46/// Block size that maximises occupancy for `func`, assuming the given
47/// fixed `dynamic_smem_bytes`. Returns `(min_grid_size, optimal_block_size)`:
48/// launch `min_grid_size` blocks of `optimal_block_size` threads to cover
49/// the device with peak SM-utilization.
50///
51/// `block_size_limit` clamps the returned block size; pass `0` for the
52/// device's documented maximum.
53pub fn max_potential_block_size(
54 func: &Function,
55 dynamic_smem_bytes: usize,
56 block_size_limit: i32,
57) -> Result<(i32, i32)> {
58 let d = driver()?;
59 let cu = d.cu_occupancy_max_potential_block_size()?;
60 let mut min_grid: core::ffi::c_int = 0;
61 let mut block: core::ffi::c_int = 0;
62 // SAFETY: both output pointers are writable; passing `None` as the
63 // variable-dynamic-smem-size callback means dynamic_smem_bytes is taken
64 // as a fixed value.
65 check(unsafe {
66 cu(
67 &mut min_grid,
68 &mut block,
69 func.as_raw(),
70 None,
71 dynamic_smem_bytes,
72 block_size_limit,
73 )
74 })?;
75 Ok((min_grid, block))
76}
77
78/// Given `num_blocks` concurrent blocks per SM with `block_size` threads
79/// each, how much dynamic shared memory (bytes) can each block still
80/// allocate without losing occupancy.
81///
82/// Useful for tiling kernels that grow their shared-memory usage up to the
83/// point occupancy drops.
84pub fn available_dynamic_smem_per_block(
85 func: &Function,
86 num_blocks: i32,
87 block_size: i32,
88) -> Result<usize> {
89 let d = driver()?;
90 let cu = d.cu_occupancy_available_dynamic_smem_per_block()?;
91 let mut bytes: usize = 0;
92 check(unsafe { cu(&mut bytes, func.as_raw(), num_blocks, block_size) })?;
93 Ok(bytes)
94}