baracuda_driver/
occupancy.rs

1//! Occupancy calculators — how many blocks can a kernel fit per SM, and
2//! what block size maximizes utilization.
3//!
4//! These are essential for kernel tuning: before launching, ask the driver
5//! "given this kernel and this dynamic-shared-memory budget, what's the
6//! best grid/block shape?" The results depend on the target device's
7//! register file, shared-memory size, and the kernel's own resource use.
8
9use baracuda_cuda_sys::driver;
10
11use crate::error::{check, Result};
12use crate::module::Function;
13
14/// How many blocks of `block_size` threads (using `dynamic_smem_bytes` of
15/// dynamic shared memory per block) can run concurrently on each SM of the
16/// current device.
17pub fn max_active_blocks_per_multiprocessor(
18    func: &Function,
19    block_size: i32,
20    dynamic_smem_bytes: usize,
21) -> Result<i32> {
22    let d = driver()?;
23    let cu = d.cu_occupancy_max_active_blocks_per_multiprocessor()?;
24    let mut n: core::ffi::c_int = 0;
25    // SAFETY: `func.as_raw()` is a live kernel handle; `&mut n` is writable.
26    check(unsafe { cu(&mut n, func.as_raw(), block_size, dynamic_smem_bytes) })?;
27    Ok(n)
28}
29
30/// As above, but with an explicit flag bitmask (see
31/// `CU_OCCUPANCY_*` in NVIDIA's headers). Passing `0` matches the
32/// no-flags version.
33pub fn max_active_blocks_per_multiprocessor_with_flags(
34    func: &Function,
35    block_size: i32,
36    dynamic_smem_bytes: usize,
37    flags: u32,
38) -> Result<i32> {
39    let d = driver()?;
40    let cu = d.cu_occupancy_max_active_blocks_per_multiprocessor_with_flags()?;
41    let mut n: core::ffi::c_int = 0;
42    check(unsafe { cu(&mut n, func.as_raw(), block_size, dynamic_smem_bytes, flags) })?;
43    Ok(n)
44}
45
46/// Block size that maximises occupancy for `func`, assuming the given
47/// fixed `dynamic_smem_bytes`. Returns `(min_grid_size, optimal_block_size)`:
48/// launch `min_grid_size` blocks of `optimal_block_size` threads to cover
49/// the device with peak SM-utilization.
50///
51/// `block_size_limit` clamps the returned block size; pass `0` for the
52/// device's documented maximum.
53pub fn max_potential_block_size(
54    func: &Function,
55    dynamic_smem_bytes: usize,
56    block_size_limit: i32,
57) -> Result<(i32, i32)> {
58    let d = driver()?;
59    let cu = d.cu_occupancy_max_potential_block_size()?;
60    let mut min_grid: core::ffi::c_int = 0;
61    let mut block: core::ffi::c_int = 0;
62    // SAFETY: both output pointers are writable; passing `None` as the
63    // variable-dynamic-smem-size callback means dynamic_smem_bytes is taken
64    // as a fixed value.
65    check(unsafe {
66        cu(
67            &mut min_grid,
68            &mut block,
69            func.as_raw(),
70            None,
71            dynamic_smem_bytes,
72            block_size_limit,
73        )
74    })?;
75    Ok((min_grid, block))
76}
77
78/// Given `num_blocks` concurrent blocks per SM with `block_size` threads
79/// each, how much dynamic shared memory (bytes) can each block still
80/// allocate without losing occupancy.
81///
82/// Useful for tiling kernels that grow their shared-memory usage up to the
83/// point occupancy drops.
84pub fn available_dynamic_smem_per_block(
85    func: &Function,
86    num_blocks: i32,
87    block_size: i32,
88) -> Result<usize> {
89    let d = driver()?;
90    let cu = d.cu_occupancy_available_dynamic_smem_per_block()?;
91    let mut bytes: usize = 0;
92    check(unsafe { cu(&mut bytes, func.as_raw(), num_blocks, block_size) })?;
93    Ok(bytes)
94}
baracuda_driver/occupancy.rs

baracuda_driver/
occupancy.rs