// oxicuda_driver/occupancy.rs
1//! GPU occupancy queries for performance optimisation.
2//!
3//! Occupancy measures how effectively GPU resources (warps, registers,
4//! shared memory) are utilised. These queries help select launch
5//! configurations that maximise hardware utilisation.
6//!
7//! # Example
8//!
9//! ```rust,no_run
10//! # use oxicuda_driver::module::Module;
11//! # fn main() -> Result<(), oxicuda_driver::error::CudaError> {
12//! # let module: Module = unimplemented!();
13//! let func = module.get_function("my_kernel")?;
14//!
15//! // Query the optimal block size for maximum occupancy.
16//! let (min_grid_size, optimal_block_size) = func.optimal_block_size(0)?;
17//! println!("optimal: grid >= {min_grid_size}, block = {optimal_block_size}");
18//!
19//! // Query active blocks per SM for a specific block size.
20//! let active = func.max_active_blocks_per_sm(256, 0)?;
21//! println!("active blocks per SM with 256 threads: {active}");
22//! # Ok(())
23//! # }
24//! ```
25
26use crate::error::CudaResult;
27use crate::loader::try_driver;
28use crate::module::Function;
29
30impl Function {
31    /// Returns the maximum number of active blocks per streaming
32    /// multiprocessor for a given block size and dynamic shared memory.
33    ///
34    /// This is useful for evaluating different block sizes to find
35    /// the configuration that achieves the highest occupancy.
36    ///
37    /// # Parameters
38    ///
39    /// * `block_size` — number of threads per block.
40    /// * `dynamic_smem` — dynamic shared memory per block in bytes
41    ///   (set to `0` if the kernel does not use dynamic shared memory).
42    ///
43    /// # Errors
44    ///
45    /// Returns a [`CudaError`](crate::error::CudaError) if the function
46    /// handle is invalid or the driver call fails.
47    pub fn max_active_blocks_per_sm(
48        &self,
49        block_size: i32,
50        dynamic_smem: usize,
51    ) -> CudaResult<i32> {
52        let api = try_driver()?;
53        let mut num_blocks: i32 = 0;
54        crate::cuda_call!((api.cu_occupancy_max_active_blocks_per_multiprocessor)(
55            &mut num_blocks,
56            self.raw(),
57            block_size,
58            dynamic_smem,
59        ))?;
60        Ok(num_blocks)
61    }
62
63    /// Suggests an optimal launch configuration that maximises
64    /// multiprocessor occupancy.
65    ///
66    /// Returns `(min_grid_size, optimal_block_size)` where:
67    ///
68    /// * `min_grid_size` — the minimum number of blocks needed to
69    ///   achieve maximum occupancy across all SMs.
70    /// * `optimal_block_size` — the block size (number of threads)
71    ///   that achieves maximum occupancy.
72    ///
73    /// # Parameters
74    ///
75    /// * `dynamic_smem` — dynamic shared memory per block in bytes
76    ///   (set to `0` if the kernel does not use dynamic shared memory).
77    ///
78    /// # Errors
79    ///
80    /// Returns a [`CudaError`](crate::error::CudaError) if the function
81    /// handle is invalid or the driver call fails.
82    pub fn optimal_block_size(&self, dynamic_smem: usize) -> CudaResult<(i32, i32)> {
83        let api = try_driver()?;
84        let mut min_grid_size: i32 = 0;
85        let mut block_size: i32 = 0;
86        crate::cuda_call!((api.cu_occupancy_max_potential_block_size)(
87            &mut min_grid_size,
88            &mut block_size,
89            self.raw(),
90            None, // no dynamic smem callback
91            dynamic_smem,
92            0, // no block size limit
93        ))?;
94        Ok((min_grid_size, block_size))
95    }
96}