oxicuda_driver/occupancy.rs
1//! GPU occupancy queries for performance optimisation.
2//!
3//! Occupancy measures how effectively GPU resources (warps, registers,
4//! shared memory) are utilised. These queries help select launch
5//! configurations that maximise hardware utilisation.
6//!
7//! # Example
8//!
9//! ```rust,no_run
10//! # use oxicuda_driver::module::Module;
11//! # fn main() -> Result<(), oxicuda_driver::error::CudaError> {
12//! # let module: Module = unimplemented!();
13//! let func = module.get_function("my_kernel")?;
14//!
15//! // Query the optimal block size for maximum occupancy.
16//! let (min_grid_size, optimal_block_size) = func.optimal_block_size(0)?;
17//! println!("optimal: grid >= {min_grid_size}, block = {optimal_block_size}");
18//!
19//! // Query active blocks per SM for a specific block size.
20//! let active = func.max_active_blocks_per_sm(256, 0)?;
21//! println!("active blocks per SM with 256 threads: {active}");
22//! # Ok(())
23//! # }
24//! ```
25
26use crate::error::CudaResult;
27use crate::loader::try_driver;
28use crate::module::Function;
29
30impl Function {
31 /// Returns the maximum number of active blocks per streaming
32 /// multiprocessor for a given block size and dynamic shared memory.
33 ///
34 /// This is useful for evaluating different block sizes to find
35 /// the configuration that achieves the highest occupancy.
36 ///
37 /// # Parameters
38 ///
39 /// * `block_size` — number of threads per block.
40 /// * `dynamic_smem` — dynamic shared memory per block in bytes
41 /// (set to `0` if the kernel does not use dynamic shared memory).
42 ///
43 /// # Errors
44 ///
45 /// Returns a [`CudaError`](crate::error::CudaError) if the function
46 /// handle is invalid or the driver call fails.
47 pub fn max_active_blocks_per_sm(
48 &self,
49 block_size: i32,
50 dynamic_smem: usize,
51 ) -> CudaResult<i32> {
52 let api = try_driver()?;
53 let mut num_blocks: i32 = 0;
54 crate::cuda_call!((api.cu_occupancy_max_active_blocks_per_multiprocessor)(
55 &mut num_blocks,
56 self.raw(),
57 block_size,
58 dynamic_smem,
59 ))?;
60 Ok(num_blocks)
61 }
62
63 /// Suggests an optimal launch configuration that maximises
64 /// multiprocessor occupancy.
65 ///
66 /// Returns `(min_grid_size, optimal_block_size)` where:
67 ///
68 /// * `min_grid_size` — the minimum number of blocks needed to
69 /// achieve maximum occupancy across all SMs.
70 /// * `optimal_block_size` — the block size (number of threads)
71 /// that achieves maximum occupancy.
72 ///
73 /// # Parameters
74 ///
75 /// * `dynamic_smem` — dynamic shared memory per block in bytes
76 /// (set to `0` if the kernel does not use dynamic shared memory).
77 ///
78 /// # Errors
79 ///
80 /// Returns a [`CudaError`](crate::error::CudaError) if the function
81 /// handle is invalid or the driver call fails.
82 pub fn optimal_block_size(&self, dynamic_smem: usize) -> CudaResult<(i32, i32)> {
83 let api = try_driver()?;
84 let mut min_grid_size: i32 = 0;
85 let mut block_size: i32 = 0;
86 crate::cuda_call!((api.cu_occupancy_max_potential_block_size)(
87 &mut min_grid_size,
88 &mut block_size,
89 self.raw(),
90 None, // no dynamic smem callback
91 dynamic_smem,
92 0, // no block size limit
93 ))?;
94 Ok((min_grid_size, block_size))
95 }
96}