singe-cuda 0.1.0-alpha.4

#[allow(unused_imports)]
use crate::error::ErrorCode;

use std::{
    borrow::Cow,
    ffi::CString,
    fmt::{self, Display, Formatter},
    ptr,
    sync::Arc,
};

use singe_cuda_sys::driver;

use crate::{
    context::Context,
    dim::Dim3,
    error::{Error, Result},
    graph::{ExecutableGraph, Graph, GraphNode, KernelNodeParams},
    kernel::{self, ModuleKernelHandle},
    stream::Stream,
    try_cuda,
    types::{DeviceFunction, FunctionAttribute, SharedMemoryCarveout},
};

bitflags::bitflags! {
    /// Flags accepted by the `*_with_flags` occupancy queries on [`KernelFunction`].
    ///
    /// Each constant takes its bit value from the same-named `CUoccupancy_flags` entry in the
    /// driver bindings.
    #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
    pub struct OccupancyFlags: u32 {
        /// Bit value of `CU_OCCUPANCY_DEFAULT`.
        const DEFAULT = driver::CUoccupancy_flags::CU_OCCUPANCY_DEFAULT as _;
        /// Bit value of `CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE`.
        const DISABLE_CACHING_OVERRIDE = driver::CUoccupancy_flags::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE as _;
    }
}

impl Display for OccupancyFlags {
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        if self.is_empty() {
            return Ok(());
        }
        let mut first = true;
        let write_sep = |f: &mut Formatter<'_>, first: &mut bool, name: &str| -> fmt::Result {
            if *first {
                *first = false;
            } else {
                f.write_str(" | ")?;
            }
            f.write_str(name)
        };

        if self.contains(Self::DEFAULT) {
            write_sep(f, &mut first, "CU_OCCUPANCY_DEFAULT")?;
        }
        if self.contains(Self::DISABLE_CACHING_OVERRIDE) {
            write_sep(f, &mut first, "CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE")?;
        }

        Ok(())
    }
}

/// Snapshot of a kernel function's attributes, as gathered by [`KernelFunction::attributes`].
///
/// Every field stores the value queried for the [`FunctionAttribute`] variant named in its
/// documentation. `usize` fields are the queried `i32` cast to `usize`; `bool` fields are
/// `true` when the queried value is non-zero.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct FunctionAttributes {
    /// From [`FunctionAttribute::SharedSizeBytes`].
    pub shared_size_bytes: usize,
    /// From [`FunctionAttribute::ConstSizeBytes`].
    pub const_size_bytes: usize,
    /// From [`FunctionAttribute::LocalSizeBytes`].
    pub local_size_bytes: usize,
    /// From [`FunctionAttribute::MaxThreadsPerBlock`].
    pub max_threads_per_block: i32,
    /// From [`FunctionAttribute::NumRegs`].
    pub num_regs: i32,
    /// From [`FunctionAttribute::PtxVersion`].
    pub ptx_version: i32,
    /// From [`FunctionAttribute::BinaryVersion`].
    pub binary_version: i32,
    /// From [`FunctionAttribute::CacheModeCa`] (non-zero means `true`).
    pub cache_mode_ca: bool,
    /// From [`FunctionAttribute::MaxDynamicSharedSizeBytes`].
    pub max_dynamic_shared_size_bytes: i32,
    /// From [`FunctionAttribute::PreferredSharedMemoryCarveout`].
    pub preferred_shared_memory_carveout: i32,
    /// From [`FunctionAttribute::ClusterSizeMustBeSet`] (non-zero means `true`).
    pub cluster_dim_must_be_set: bool,
    /// From [`FunctionAttribute::RequiredClusterWidth`].
    pub required_cluster_width: i32,
    /// From [`FunctionAttribute::RequiredClusterHeight`].
    pub required_cluster_height: i32,
    /// From [`FunctionAttribute::RequiredClusterDepth`].
    pub required_cluster_depth: i32,
    /// From [`FunctionAttribute::ClusterSchedulingPolicyPreference`].
    pub cluster_scheduling_policy_preference: i32,
    /// From [`FunctionAttribute::NonPortableClusterSizeAllowed`] (non-zero means `true`).
    pub non_portable_cluster_size_allowed: bool,
}

/// Pair of values produced by [`KernelFunction::occupancy_max_potential_block_size_with_flags`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct OccupancyMaxPotentialBlockSize {
    /// Minimum grid size, as written by `cuOccupancyMaxPotentialBlockSizeWithFlags`.
    pub min_grid_size: i32,
    /// Block size, as written by `cuOccupancyMaxPotentialBlockSizeWithFlags`.
    pub block_size: i32,
}

/// Launch shape handed to the cluster occupancy queries on [`KernelFunction`].
///
/// The queries copy these values into a driver `CUlaunchConfig` with a null stream and no
/// launch attributes.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ClusterLaunchConfig {
    /// Copied into `gridDimX`/`gridDimY`/`gridDimZ`.
    pub grid_dim: Dim3,
    /// Copied into `blockDimX`/`blockDimY`/`blockDimZ`.
    pub block_dim: Dim3,
    /// Copied into `sharedMemBytes`.
    pub shared_memory_bytes: usize,
}

/// A driver module handle together with the context it is used with.
#[derive(Debug)]
pub struct Module {
    /// Raw driver handle of the module.
    handle: driver::CUmodule,
    /// Context that is bound before the module is unloaded or a global is looked up.
    ctx: Arc<Context>,
    /// When `true`, dropping the [`Module`] unloads `handle`; when `false`, drop is a no-op.
    owns_handle: bool,
}

/// Address and size of a module-level global, as returned by [`Module::global`].
#[derive(Debug, Clone, Copy)]
pub struct Global<'a> {
    /// Base address reported by the driver, stored as an untyped pointer.
    ptr: *mut (),
    /// Size reported by the driver for the global.
    size: usize,
    /// Borrow of the module the lookup was made in; ties this value's lifetime to the module.
    _module: &'a Module,
}

/// A raw driver texture-reference handle tied to the lifetime of a [`Module`].
///
/// NOTE(review): no constructor is visible in this part of the file — confirm where values of
/// this type are created.
#[derive(Debug, Clone, Copy)]
pub struct TextureReference<'a> {
    /// Raw driver handle.
    handle: driver::CUtexref,
    /// Borrow of the associated module; keeps the module alive while this handle exists.
    _module: &'a Module,
}

/// A raw driver surface-reference handle tied to the lifetime of a [`Module`].
///
/// NOTE(review): no constructor is visible in this part of the file — confirm where values of
/// this type are created.
#[derive(Debug, Clone, Copy)]
pub struct SurfaceReference<'a> {
    /// Raw driver handle.
    handle: driver::CUsurfref,
    /// Borrow of the associated module; keeps the module alive while this handle exists.
    _module: &'a Module,
}

/// Bytes of a module image, either borrowed from the caller or owned.
#[derive(Debug, Clone)]
pub struct ModuleImage<'a> {
    /// The image bytes; borrowed when built with `new`, owned when built from a `Vec`/`String`.
    data: Cow<'a, [u8]>,
}

/// A device function handle paired with a borrow of the [`Module`] it was looked up in.
#[derive(Debug)]
pub struct KernelFunction<'a> {
    /// Handle passed to the driver for launches, attribute access and occupancy queries.
    handle: DeviceFunction,
    /// Module the function belongs to; its context is bound before driver calls.
    module: &'a Module,
}

/// Grid shape, block shape and dynamic shared memory size for a kernel launch.
#[derive(Debug, Clone)]
pub struct LaunchConfig {
    /// Grid dimensions passed to the launch call.
    pub grid_dim: Dim3,
    /// Block dimensions passed to the launch call.
    pub block_dim: Dim3,
    /// Dynamic shared memory size, in bytes, passed to the launch call.
    pub shared_memory_bytes: usize,
}

/// Ordered list of type-erased pointers to kernel argument values.
///
/// The pointers are stored without any lifetime, so nothing in the type keeps the pointed-to
/// values alive.
#[derive(Debug)]
pub struct KernelParameters {
    /// One pointer per kernel argument, in the order the arguments were added.
    arguments: Vec<*mut ()>,
}

impl Module {
    /// Wraps `handle` as an owning [`Module`]: dropping the returned value unloads the module.
    ///
    /// # Safety
    ///
    /// Dropping the result binds `ctx` and passes `handle` to `cuModuleUnload`, so `handle` must
    /// be a module handle that is valid to unload in `ctx` and that nothing else unloads.
    pub const unsafe fn from_raw(handle: driver::CUmodule, ctx: Arc<Context>) -> Self {
        Self {
            handle,
            ctx,
            owns_handle: true,
        }
    }

    /// Wraps `handle` without taking ownership: dropping the returned value does not unload it.
    ///
    /// # Safety
    ///
    /// `handle` is passed to the driver by the methods of this type, so it must stay a valid
    /// module handle for as long as the returned [`Module`] (and anything borrowed from it) is
    /// used.
    pub const unsafe fn from_borrowed_raw(handle: driver::CUmodule, ctx: Arc<Context>) -> Self {
        Self {
            handle,
            ctx,
            owns_handle: false,
        }
    }

    /// Returns the handle of the function with the given name located in the module.
    /// If no function of that name exists, [`Module::function`] returns [`ErrorCode::NotFound`].
    ///
    /// The module's context is bound before the lookup, so the call also works on a thread on
    /// which the context is not current yet.
    ///
    /// # Errors
    ///
    /// Fails if `name` contains an interior NUL byte, if the context cannot be bound, if the
    /// driver reports an error, or with [`Error::NullHandle`] if the driver reports success but
    /// yields a null handle.
    ///
    /// Note:
    ///
    /// This function may also return error codes from previous, asynchronous launches.
    pub fn function(&self, name: &str) -> Result<KernelFunction<'_>> {
        let c_name = CString::new(name)?;
        let mut function_handle = ptr::null_mut();
        self.ctx.bind()?;
        // SAFETY: the out-pointer refers to a live local and `c_name` is a NUL-terminated
        // string that outlives the call; validity of `self.handle` is the constructor's contract.
        unsafe {
            try_cuda!(driver::cuModuleGetFunction(
                &raw mut function_handle,
                self.handle,
                c_name.as_ptr(),
            ))?;
        }
        if function_handle.is_null() {
            return Err(Error::NullHandle);
        }
        // SAFETY: the handle was just returned by the driver for this module.
        Ok(unsafe { KernelFunction::from_raw(function_handle.into(), self) })
    }

    /// Returns the number of functions in this module.
    ///
    /// The module's context is bound before the query.
    ///
    /// # Errors
    ///
    /// Fails if the context cannot be bound or if the driver reports an error.
    pub fn function_count(&self) -> Result<usize> {
        let mut count = 0;
        self.ctx.bind()?;
        // SAFETY: the out-pointer refers to a live local; validity of `self.handle` is the
        // constructor's contract.
        unsafe {
            try_cuda!(driver::cuModuleGetFunctionCount(
                &raw mut count,
                self.handle
            ))?;
        }
        Ok(count as usize)
    }

    /// Returns the raw driver handle of this module.
    ///
    /// # Safety
    ///
    /// NOTE(review): the handle stays owned by `self` when this module owns it (it is unloaded
    /// on drop); the caller presumably must not unload it or use it after `self` is dropped —
    /// confirm the intended contract.
    pub const unsafe fn as_raw(&self) -> driver::CUmodule {
        self.handle
    }

    /// Returns the base pointer and size of the global with the given name located in the module.
    /// If no variable of that name exists, [`Module::global`] returns [`ErrorCode::NotFound`].
    ///
    /// # Errors
    ///
    /// Fails if `name` contains an interior NUL byte, if the context cannot be bound, or if the
    /// driver reports an error.
    ///
    /// Note:
    ///
    /// This function may also return error codes from previous, asynchronous launches.
    pub fn global(&self, name: &str) -> Result<Global<'_>> {
        let c_name = CString::new(name)?;
        let mut device_ptr = 0;
        let mut byte_len = 0;
        self.ctx.bind()?;
        // SAFETY: both out-pointers refer to live locals and `c_name` is a NUL-terminated
        // string that outlives the call; validity of `self.handle` is the constructor's contract.
        unsafe {
            try_cuda!(driver::cuModuleGetGlobal_v2(
                &raw mut device_ptr,
                &raw mut byte_len,
                self.handle,
                c_name.as_ptr(),
            ))?;
        }
        Ok(Global {
            ptr: device_ptr as _,
            size: byte_len as _,
            _module: self,
        })
    }
}

impl Drop for Module {
    /// Unloads the module if this value owns its handle.
    ///
    /// Failures cannot be propagated from `drop`; they are reported on stderr in debug builds
    /// and otherwise ignored. If the context cannot be bound, no unload is attempted.
    fn drop(&mut self) {
        if self.owns_handle {
            let bind_failure = self.ctx.bind().err();
            if let Some(err) = bind_failure {
                #[cfg(debug_assertions)]
                eprintln!("failed to bind context before unloading module: {err}");
                return;
            }

            // SAFETY: this value owns `self.handle` (checked above) and is being dropped, so
            // the handle is not used again through it.
            let unload_failure = unsafe { try_cuda!(driver::cuModuleUnload(self.handle)) }.err();
            if let Some(err) = unload_failure {
                #[cfg(debug_assertions)]
                eprintln!("failed to unload cuda module: {err}");
            }
        }
    }
}

// SAFETY (review): `Module` stores a raw driver handle, so `Send` is asserted by hand here.
// This presumably relies on the driver permitting a module handle to be used from another
// thread once its context is bound there — TODO confirm.
unsafe impl Send for Module {}

// SAFETY (review): same assumption as for `Send`; additionally presumes the driver calls made
// through `&Module` may run concurrently from several threads — TODO confirm.
unsafe impl Sync for Module {}

impl<'a> ModuleImage<'a> {
    pub const fn new(data: &'a [u8]) -> Self {
        Self {
            data: Cow::Borrowed(data),
        }
    }

    pub fn from_vec(data: Vec<u8>) -> Self {
        Self {
            data: Cow::Owned(data),
        }
    }

    pub fn from_string(data: String) -> Self {
        Self::from_vec(data.into_bytes())
    }

    pub fn as_ptr(&self) -> *const () {
        self.data.as_ptr().cast()
    }

    pub fn as_bytes(&self) -> &[u8] {
        self.data.as_ref()
    }
}

impl Global<'_> {
    /// Returns the stored base address of the global as an untyped pointer.
    pub const fn as_ptr(&self) -> *mut () {
        self.ptr
    }

    /// Returns the stored size of the global.
    pub const fn size(&self) -> usize {
        self.size
    }
}

impl TextureReference<'_> {
    /// Returns the raw driver texture-reference handle.
    ///
    /// # Safety
    ///
    /// NOTE(review): the handle is copied out without the module borrow; the caller presumably
    /// must not use it after the associated module is gone — confirm the intended contract.
    pub const unsafe fn as_raw(&self) -> driver::CUtexref {
        self.handle
    }
}

impl SurfaceReference<'_> {
    /// Returns the raw driver surface-reference handle.
    ///
    /// # Safety
    ///
    /// NOTE(review): the handle is copied out without the module borrow; the caller presumably
    /// must not use it after the associated module is gone — confirm the intended contract.
    pub const unsafe fn as_raw(&self) -> driver::CUsurfref {
        self.handle
    }
}

impl KernelFunction<'_> {
    /// Pairs a device function handle with the module it is used with.
    ///
    /// # Safety
    ///
    /// The methods of [`KernelFunction`] bind `module`'s context and hand `handle` to the
    /// driver. NOTE(review): `handle` presumably must be a valid function of `module` — confirm.
    pub const unsafe fn from_raw(handle: DeviceFunction, module: &Module) -> KernelFunction<'_> {
        KernelFunction { handle, module }
    }

    /// Invokes this kernel function on a grid of blocks.
    /// Each block contains the threads specified by [`LaunchConfig::block_dim`].
    ///
    /// [`LaunchConfig::shared_memory_bytes`] sets the amount of dynamic shared memory that will be available to each thread block.
    ///
    /// Kernel parameters are passed with [`KernelParameters`].
    ///
    /// Launching the kernel invalidates the persistent function state set through the following deprecated APIs: [`sys::cuFuncSetBlockShape`](singe_cuda_sys::driver::cuFuncSetBlockShape), [`sys::cuFuncSetSharedSize`](singe_cuda_sys::driver::cuFuncSetSharedSize), [`sys::cuParamSetSize`](singe_cuda_sys::driver::cuParamSetSize), [`sys::cuParamSeti`](singe_cuda_sys::driver::cuParamSeti), [`sys::cuParamSetf`](singe_cuda_sys::driver::cuParamSetf), [`sys::cuParamSetv`](singe_cuda_sys::driver::cuParamSetv).
    ///
    /// The kernel must either have been compiled with toolchain version 3.2 or later so that it contains kernel parameter information, or have no kernel parameters.
    /// If either of these conditions is not met, the launch returns [`ErrorCode::InvalidImage`].
    ///
    /// Note:
    ///
    /// * This function uses standard `default stream` semantics.
    /// * Note that this function may also return error codes from previous, asynchronous launches.
    pub fn launch(&self, config: &LaunchConfig, mut params: KernelParameters) -> Result<()> {
        self.module.ctx.bind()?;
        unsafe {
            try_cuda!(driver::cuLaunchKernel(
                self.handle.as_raw(),
                config.grid_dim.x,
                config.grid_dim.y,
                config.grid_dim.z,
                config.block_dim.x,
                config.block_dim.y,
                config.block_dim.z,
                config.shared_memory_bytes as _,
                ptr::null_mut(),
                params.arguments.as_mut_ptr().cast(),
                ptr::null_mut(),
            ))?;
        }
        Ok(())
    }

    /// Invokes this kernel function on a grid of blocks using the given stream.
    /// Each block contains the threads specified by [`LaunchConfig::block_dim`].
    ///
    /// [`LaunchConfig::shared_memory_bytes`] sets the amount of dynamic shared memory that will be available to each thread block.
    ///
    /// Kernel parameters are passed with [`KernelParameters`].
    ///
    /// Launching the kernel invalidates the persistent function state set through the following deprecated APIs: [`sys::cuFuncSetBlockShape`](singe_cuda_sys::driver::cuFuncSetBlockShape), [`sys::cuFuncSetSharedSize`](singe_cuda_sys::driver::cuFuncSetSharedSize), [`sys::cuParamSetSize`](singe_cuda_sys::driver::cuParamSetSize), [`sys::cuParamSeti`](singe_cuda_sys::driver::cuParamSeti), [`sys::cuParamSetf`](singe_cuda_sys::driver::cuParamSetf), [`sys::cuParamSetv`](singe_cuda_sys::driver::cuParamSetv).
    ///
    /// The kernel must either have been compiled with toolchain version 3.2 or later so that it contains kernel parameter information, or have no kernel parameters.
    /// If either of these conditions is not met, the launch returns [`ErrorCode::InvalidImage`].
    ///
    /// # Errors
    ///
    /// Returns `CUDA_ERROR_INVALID_CONTEXT` without launching if `stream` belongs to a different
    /// context than this function's module, and `CUDA_ERROR_INVALID_VALUE` if
    /// [`LaunchConfig::shared_memory_bytes`] does not fit in the 32-bit size the driver accepts.
    /// Otherwise fails if the module's context cannot be bound or the driver rejects the launch.
    ///
    /// Note:
    ///
    /// * The launch is submitted to `stream`, not to the default stream.
    /// * This function may also return error codes from previous, asynchronous launches.
    pub fn launch_on(
        &self,
        config: &LaunchConfig,
        mut params: KernelParameters,
        stream: &Stream,
    ) -> Result<()> {
        if stream.context() != self.module.ctx.as_ref() {
            return Err(driver::CUresult::CUDA_ERROR_INVALID_CONTEXT.into());
        }

        // The driver takes the dynamic shared memory size as a 32-bit value; a plain cast
        // would silently launch with a truncated size.
        let Ok(shared_memory_bytes) = u32::try_from(config.shared_memory_bytes) else {
            return Err(driver::CUresult::CUDA_ERROR_INVALID_VALUE.into());
        };

        self.module.ctx.bind()?;
        // SAFETY: `params.arguments` stays alive for the duration of the call and `stream` was
        // checked to belong to the module's context. The validity of the pointers stored in
        // `params` is not enforced by `KernelParameters` and remains the caller's responsibility.
        unsafe {
            try_cuda!(driver::cuLaunchKernel(
                self.handle.as_raw(),
                config.grid_dim.x,
                config.grid_dim.y,
                config.grid_dim.z,
                config.block_dim.x,
                config.block_dim.y,
                config.block_dim.z,
                shared_memory_bytes,
                stream.as_raw(),
                params.arguments.as_mut_ptr().cast(),
                // No `extra` launch options.
                ptr::null_mut(),
            ))?;
        }
        Ok(())
    }

    /// Adds a kernel node for this function to `graph` and returns the new node.
    ///
    /// The node is described by this function's handle, the grid and block dimensions and the
    /// shared memory size from `config`, and the argument pointers held by `params`.
    /// `dependencies` is forwarded unchanged to [`Graph::add_kernel_node`].
    pub fn add_to_graph(
        &self,
        graph: &mut Graph,
        dependencies: &[GraphNode],
        config: &LaunchConfig,
        params: &mut KernelParameters,
    ) -> Result<GraphNode> {
        // CUDA copies kernel argument values while creating the graph node, so
        // the borrowed `KernelParameters` only need to stay alive for this call.
        let kernel_args = params.as_mut_ptr();
        let description = KernelNodeParams::new(self.handle, config.grid_dim, config.block_dim);
        let description = description.with_shared_mem_bytes(config.shared_memory_bytes);
        let description = description.with_kernel_params(kernel_args);
        graph.add_kernel_node(dependencies, &description)
    }

    /// Replaces the kernel parameters of `node` in an instantiated graph.
    ///
    /// The replacement is built from this function's handle, the grid and block dimensions and
    /// the shared memory size from `config`, and the argument pointers held by `params`, and is
    /// forwarded to [`ExecutableGraph::set_kernel_node_params`].
    pub fn set_graph_node_params(
        &self,
        executable: &mut ExecutableGraph,
        node: GraphNode,
        config: &LaunchConfig,
        params: &mut KernelParameters,
    ) -> Result<()> {
        let kernel_args = params.as_mut_ptr();
        let description = KernelNodeParams::new(self.handle, config.grid_dim, config.block_dim);
        let description = description.with_shared_mem_bytes(config.shared_memory_bytes);
        let description = description.with_kernel_params(kernel_args);
        executable.set_kernel_node_params(node, &description)
    }

    /// Returns the module this kernel function was paired with at construction.
    pub const fn module(&self) -> &Module {
        self.module
    }

    /// Returns the name of this kernel function.
    ///
    /// Delegates to [`kernel::name`] with the module's context and the raw function handle.
    pub fn name(&self) -> Result<String> {
        let ctx = self.module.ctx.as_ref();
        let raw_function = self.handle.as_raw();
        kernel::name::<ModuleKernelHandle>(ctx, raw_function)
    }

    /// Queries a single attribute of this kernel function.
    ///
    /// Delegates to [`kernel::attribute`] with the module's context and the raw function handle.
    pub fn attribute(&self, attribute: FunctionAttribute) -> Result<i32> {
        let ctx = self.module.ctx.as_ref();
        let raw_function = self.handle.as_raw();
        kernel::attribute::<ModuleKernelHandle>(ctx, raw_function, attribute)
    }

    /// Sets a single attribute of this kernel function to `value`.
    ///
    /// Delegates to [`kernel::set_attribute`] with the module's context and the raw function
    /// handle.
    pub fn set_attribute(&self, attribute: FunctionAttribute, value: i32) -> Result<()> {
        let ctx = self.module.ctx.as_ref();
        let raw_function = self.handle.as_raw();
        kernel::set_attribute::<ModuleKernelHandle>(ctx, raw_function, attribute, value)
    }

    /// Sets [`FunctionAttribute::MaxDynamicSharedSizeBytes`] to `bytes`.
    ///
    /// Shorthand for [`KernelFunction::set_attribute`].
    pub fn set_max_dynamic_shared_memory_bytes(&self, bytes: i32) -> Result<()> {
        let attribute = FunctionAttribute::MaxDynamicSharedSizeBytes;
        self.set_attribute(attribute, bytes)
    }

    /// Sets [`FunctionAttribute::PreferredSharedMemoryCarveout`] from `carveout`.
    ///
    /// The carveout is converted with `i32::from` and passed to
    /// [`KernelFunction::set_attribute`].
    pub fn set_preferred_shared_memory_carveout(
        &self,
        carveout: SharedMemoryCarveout,
    ) -> Result<()> {
        let raw_carveout = i32::from(carveout);
        self.set_attribute(
            FunctionAttribute::PreferredSharedMemoryCarveout,
            raw_carveout,
        )
    }

    pub fn attributes(&self) -> Result<FunctionAttributes> {
        Ok(FunctionAttributes {
            shared_size_bytes: self.attribute(FunctionAttribute::SharedSizeBytes)? as usize,
            const_size_bytes: self.attribute(FunctionAttribute::ConstSizeBytes)? as usize,
            local_size_bytes: self.attribute(FunctionAttribute::LocalSizeBytes)? as usize,
            max_threads_per_block: self.attribute(FunctionAttribute::MaxThreadsPerBlock)?,
            num_regs: self.attribute(FunctionAttribute::NumRegs)?,
            ptx_version: self.attribute(FunctionAttribute::PtxVersion)?,
            binary_version: self.attribute(FunctionAttribute::BinaryVersion)?,
            cache_mode_ca: self.attribute(FunctionAttribute::CacheModeCa)? != 0,
            max_dynamic_shared_size_bytes: self
                .attribute(FunctionAttribute::MaxDynamicSharedSizeBytes)?,
            preferred_shared_memory_carveout: self
                .attribute(FunctionAttribute::PreferredSharedMemoryCarveout)?,
            cluster_dim_must_be_set: self.attribute(FunctionAttribute::ClusterSizeMustBeSet)? != 0,
            required_cluster_width: self.attribute(FunctionAttribute::RequiredClusterWidth)?,
            required_cluster_height: self.attribute(FunctionAttribute::RequiredClusterHeight)?,
            required_cluster_depth: self.attribute(FunctionAttribute::RequiredClusterDepth)?,
            cluster_scheduling_policy_preference: self
                .attribute(FunctionAttribute::ClusterSchedulingPolicyPreference)?,
            non_portable_cluster_size_allowed: self
                .attribute(FunctionAttribute::NonPortableClusterSizeAllowed)?
                != 0,
        })
    }

    /// Returns the maximum number of active blocks per streaming multiprocessor.
    ///
    /// Same as
    /// [`KernelFunction::occupancy_max_active_blocks_per_multiprocessor_with_flags`] called
    /// with [`OccupancyFlags::DEFAULT`].
    pub fn occupancy_max_active_blocks_per_multiprocessor(
        &self,
        block_size: i32,
        dynamic_shared_memory_bytes: usize,
    ) -> Result<i32> {
        let flags = OccupancyFlags::DEFAULT;
        self.occupancy_max_active_blocks_per_multiprocessor_with_flags(
            block_size,
            dynamic_shared_memory_bytes,
            flags,
        )
    }

    /// Returns the maximum number of active blocks per streaming multiprocessor.
    ///
    /// `flags` selects how special cases are treated:
    ///
    /// * [`OccupancyFlags::DEFAULT`] keeps the behavior of [`sys::cuOccupancyMaxActiveBlocksPerMultiprocessor`](singe_cuda_sys::driver::cuOccupancyMaxActiveBlocksPerMultiprocessor).
    ///
    /// * [`OccupancyFlags::DISABLE_CACHING_OVERRIDE`] suppresses the default behavior on
    ///   platforms where global caching affects occupancy. There, when caching is enabled but
    ///   per-block SM resource usage would result in zero occupancy, the calculator normally
    ///   computes the occupancy as if caching were disabled; with this flag it returns 0
    ///   instead. See the "Unified L1/Texture Cache" section of the Maxwell tuning guide.
    ///
    /// The underlying CUDA API can also be used with a context-less kernel queried via [`Library::kernel`](crate::library::Library::kernel).
    /// This wrapper binds the module's context before querying.
    ///
    /// Note:
    ///
    /// This function may also return error codes from previous, asynchronous launches.
    pub fn occupancy_max_active_blocks_per_multiprocessor_with_flags(
        &self,
        block_size: i32,
        dynamic_shared_memory_bytes: usize,
        flags: OccupancyFlags,
    ) -> Result<i32> {
        let mut active_blocks = 0;
        self.module.ctx.bind()?;
        // SAFETY: the out-pointer refers to a live local for the duration of the call.
        let status = unsafe {
            try_cuda!(
                driver::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
                    &raw mut active_blocks,
                    self.handle.as_raw(),
                    block_size,
                    dynamic_shared_memory_bytes as _,
                    flags.bits(),
                )
            )
        };
        status?;
        Ok(active_blocks)
    }

    /// Returns the dynamic shared memory available per block when `num_blocks` blocks of
    /// `block_size` threads run on one streaming multiprocessor.
    ///
    /// The result is the largest dynamic shared memory size that still allows `num_blocks`
    /// blocks per streaming multiprocessor.
    ///
    /// The underlying CUDA API can also be used with a context-less kernel queried via [`Library::kernel`](crate::library::Library::kernel).
    /// This wrapper binds the module's context before querying.
    ///
    /// Note:
    ///
    /// This function may also return error codes from previous, asynchronous launches.
    pub fn occupancy_available_dynamic_shared_memory_per_block(
        &self,
        num_blocks: i32,
        block_size: i32,
    ) -> Result<usize> {
        let mut available_bytes = 0;
        self.module.ctx.bind()?;
        // SAFETY: the out-pointer refers to a live local for the duration of the call.
        let status = unsafe {
            try_cuda!(driver::cuOccupancyAvailableDynamicSMemPerBlock(
                &raw mut available_bytes,
                self.handle.as_raw(),
                num_blocks,
                block_size,
            ))
        };
        status?;
        Ok(available_bytes as usize)
    }

    /// Suggests a launch configuration for this kernel function.
    ///
    /// Same as [`KernelFunction::occupancy_max_potential_block_size_with_flags`] called with
    /// [`OccupancyFlags::DEFAULT`].
    pub fn occupancy_max_potential_block_size(
        &self,
        dynamic_shared_memory_bytes: usize,
        block_size_limit: i32,
    ) -> Result<OccupancyMaxPotentialBlockSize> {
        let flags = OccupancyFlags::DEFAULT;
        self.occupancy_max_potential_block_size_with_flags(
            dynamic_shared_memory_bytes,
            block_size_limit,
            flags,
        )
    }

    /// An extended version of [`sys::cuOccupancyMaxPotentialBlockSize`](singe_cuda_sys::driver::cuOccupancyMaxPotentialBlockSize) that additionally takes a `flags` parameter.
    ///
    /// `flags` selects how special cases are treated:
    ///
    /// * [`OccupancyFlags::DEFAULT`] keeps the behavior of [`sys::cuOccupancyMaxPotentialBlockSize`](singe_cuda_sys::driver::cuOccupancyMaxPotentialBlockSize).
    ///
    /// * [`OccupancyFlags::DISABLE_CACHING_OVERRIDE`] suppresses the default behavior on
    ///   platforms where global caching affects occupancy. There, the launch configuration
    ///   with maximal occupancy might not support global caching; with this flag the produced
    ///   configuration is guaranteed to be global-caching compatible, at a potential cost of
    ///   occupancy. See the "Unified L1/Texture Cache" section of the Maxwell tuning guide.
    ///
    /// No block-size-to-dynamic-shared-memory callback is passed to the driver;
    /// `dynamic_shared_memory_bytes` is used as given.
    ///
    /// The underlying CUDA API can also be used with a context-less kernel queried via [`Library::kernel`](crate::library::Library::kernel).
    /// This wrapper binds the module's context before querying.
    ///
    /// Note:
    ///
    /// This function may also return error codes from previous, asynchronous launches.
    pub fn occupancy_max_potential_block_size_with_flags(
        &self,
        dynamic_shared_memory_bytes: usize,
        block_size_limit: i32,
        flags: OccupancyFlags,
    ) -> Result<OccupancyMaxPotentialBlockSize> {
        let mut suggestion = OccupancyMaxPotentialBlockSize {
            min_grid_size: 0,
            block_size: 0,
        };
        self.module.ctx.bind()?;
        // SAFETY: both out-pointers refer to fields of a live local for the duration of the
        // call.
        let status = unsafe {
            try_cuda!(driver::cuOccupancyMaxPotentialBlockSizeWithFlags(
                &raw mut suggestion.min_grid_size,
                &raw mut suggestion.block_size,
                self.handle.as_raw(),
                None,
                dynamic_shared_memory_bytes as _,
                block_size_limit,
                flags.bits(),
            ))
        };
        status?;
        Ok(suggestion)
    }

    /// Given this kernel function and launch configuration, returns the maximum cluster size.
    ///
    /// The cluster dimensions in `config` are ignored.
    /// If the function has a required cluster size set, the returned value reflects the required cluster size.
    ///
    /// By default this function will always return a value that's portable on future hardware.
    /// A higher value may be returned if the kernel function allows non-portable cluster sizes.
    ///
    /// This function will respect the compile time launch bounds.
    ///
    /// The underlying CUDA API can also be used with a context-less kernel queried via [`Library::kernel`](crate::library::Library::kernel).
    /// This wrapper binds the module's context before querying.
    ///
    /// # Errors
    ///
    /// Returns `CUDA_ERROR_INVALID_VALUE` without querying if `config.shared_memory_bytes` does
    /// not fit in the 32-bit field of the driver's launch configuration. Otherwise fails if the
    /// context cannot be bound or the driver reports an error.
    ///
    /// Note:
    ///
    /// This function may also return error codes from previous, asynchronous launches.
    pub fn occupancy_max_potential_cluster_size(&self, config: ClusterLaunchConfig) -> Result<i32> {
        // `sharedMemBytes` is a 32-bit field; a plain cast would silently truncate the size
        // and make the driver answer for a different configuration.
        let Ok(shared_memory_bytes) = u32::try_from(config.shared_memory_bytes) else {
            return Err(driver::CUresult::CUDA_ERROR_INVALID_VALUE.into());
        };

        self.module.ctx.bind()?;
        let mut cluster_size = 0;
        let config = driver::CUlaunchConfig {
            gridDimX: config.grid_dim.x,
            gridDimY: config.grid_dim.y,
            gridDimZ: config.grid_dim.z,
            blockDimX: config.block_dim.x,
            blockDimY: config.block_dim.y,
            blockDimZ: config.block_dim.z,
            sharedMemBytes: shared_memory_bytes,
            // Null stream and no launch attributes.
            hStream: ptr::null_mut(),
            attrs: ptr::null_mut(),
            numAttrs: 0,
        };
        // SAFETY: the out-pointer and the configuration pointer refer to live locals for the
        // duration of the call.
        unsafe {
            try_cuda!(driver::cuOccupancyMaxPotentialClusterSize(
                &raw mut cluster_size,
                self.handle.as_raw(),
                &raw const config,
            ))?;
        }
        Ok(cluster_size)
    }

    /// Given this kernel function and launch configuration, returns the maximum number of clusters that could co-exist on the target device.
    ///
    /// If the function has a required cluster size already set, the cluster size from `config` must either be unspecified or match the required size.
    /// Without required sizes, the cluster size must be specified in `config`, otherwise the function will return an error.
    ///
    /// Note that various attributes of the kernel function may affect occupancy calculation.
    /// Runtime environment may affect how the hardware schedules the clusters, so the calculated occupancy is not guaranteed to be achievable.
    ///
    /// The underlying CUDA API can also be used with a context-less kernel queried via [`Library::kernel`](crate::library::Library::kernel).
    /// This wrapper binds the module's context before querying.
    ///
    /// # Errors
    ///
    /// Returns `CUDA_ERROR_INVALID_VALUE` without querying if `config.shared_memory_bytes` does
    /// not fit in the 32-bit field of the driver's launch configuration. Otherwise fails if the
    /// context cannot be bound or the driver reports an error.
    ///
    /// Note:
    ///
    /// This function may also return error codes from previous, asynchronous launches.
    pub fn occupancy_max_active_clusters(&self, config: ClusterLaunchConfig) -> Result<i32> {
        // `sharedMemBytes` is a 32-bit field; a plain cast would silently truncate the size
        // and make the driver answer for a different configuration.
        let Ok(shared_memory_bytes) = u32::try_from(config.shared_memory_bytes) else {
            return Err(driver::CUresult::CUDA_ERROR_INVALID_VALUE.into());
        };

        self.module.ctx.bind()?;
        let mut clusters = 0;
        let config = driver::CUlaunchConfig {
            gridDimX: config.grid_dim.x,
            gridDimY: config.grid_dim.y,
            gridDimZ: config.grid_dim.z,
            blockDimX: config.block_dim.x,
            blockDimY: config.block_dim.y,
            blockDimZ: config.block_dim.z,
            sharedMemBytes: shared_memory_bytes,
            // Null stream and no launch attributes.
            hStream: ptr::null_mut(),
            attrs: ptr::null_mut(),
            numAttrs: 0,
        };
        // SAFETY: the out-pointer and the configuration pointer refer to live locals for the
        // duration of the call.
        unsafe {
            try_cuda!(driver::cuOccupancyMaxActiveClusters(
                &raw mut clusters,
                self.handle.as_raw(),
                &raw const config,
            ))?;
        }
        Ok(clusters)
    }

    /// Returns a copy of the underlying device function handle.
    ///
    /// # Safety
    ///
    /// NOTE(review): the handle is copied out without the module borrow; the caller presumably
    /// must not use it after the module is unloaded — confirm the intended contract.
    pub const unsafe fn as_raw(&self) -> DeviceFunction {
        self.handle
    }
}

impl LaunchConfig {
    /// Creates a launch configuration from explicit grid and block dimensions and a dynamic
    /// shared memory size in bytes.
    pub const fn new(grid_dim: Dim3, block_dim: Dim3, shared_memory_bytes: usize) -> Self {
        Self {
            grid_dim,
            block_dim,
            shared_memory_bytes,
        }
    }

    /// Creates a one-dimensional configuration with enough blocks of `block_size` threads to
    /// cover `element_count` elements (rounded up) and no dynamic shared memory.
    ///
    /// # Panics
    ///
    /// Panics if `block_size` is zero, or if `block_size` or the resulting grid size does not
    /// fit in a `u32`.
    pub const fn for_1d_grid(element_count: usize, block_size: usize) -> Self {
        Self::new(
            Dim3::new(Self::grid_extent(element_count, block_size), 1, 1),
            Dim3::new(Self::dim_component(block_size), 1, 1),
            0,
        )
    }

    /// Creates a two-dimensional configuration with enough `block_width` × `block_height`
    /// blocks to cover a `width` × `height` domain (rounded up per axis) and no dynamic shared
    /// memory.
    ///
    /// # Panics
    ///
    /// Panics if a block extent is zero, or if a block extent or a resulting grid extent does
    /// not fit in a `u32`.
    pub const fn for_2d_grid(
        width: usize,
        height: usize,
        block_width: usize,
        block_height: usize,
    ) -> Self {
        Self::new(
            Dim3::new(
                Self::grid_extent(width, block_width),
                Self::grid_extent(height, block_height),
                1,
            ),
            Dim3::new(
                Self::dim_component(block_width),
                Self::dim_component(block_height),
                1,
            ),
            0,
        )
    }

    /// Creates a three-dimensional configuration with enough
    /// `block_width` × `block_height` × `block_depth` blocks to cover a
    /// `width` × `height` × `depth` domain (rounded up per axis) and no dynamic shared memory.
    ///
    /// # Panics
    ///
    /// Panics if a block extent is zero, or if a block extent or a resulting grid extent does
    /// not fit in a `u32`.
    pub const fn for_3d_grid(
        width: usize,
        height: usize,
        depth: usize,
        block_width: usize,
        block_height: usize,
        block_depth: usize,
    ) -> Self {
        Self::new(
            Dim3::new(
                Self::grid_extent(width, block_width),
                Self::grid_extent(height, block_height),
                Self::grid_extent(depth, block_depth),
            ),
            Dim3::new(
                Self::dim_component(block_width),
                Self::dim_component(block_height),
                Self::dim_component(block_depth),
            ),
            0,
        )
    }

    /// Number of blocks of `block_extent` needed to cover `extent` along one axis.
    const fn grid_extent(extent: usize, block_extent: usize) -> u32 {
        assert!(block_extent != 0, "block dimensions must be non-zero");
        Self::dim_component(extent.div_ceil(block_extent))
    }

    /// Converts one launch dimension to `u32`, panicking instead of truncating silently.
    const fn dim_component(value: usize) -> u32 {
        assert!(
            value <= u32::MAX as usize,
            "launch dimension does not fit in u32"
        );
        value as u32
    }
}

// TODO: Verify and/or fix API to prevent use-after-free
impl KernelParameters {
    pub const fn new() -> Self {
        Self {
            arguments: Vec::new(),
        }
    }

    pub fn arg<T>(&mut self, value: &T) -> &mut Self {
        self.arguments.push(ptr::from_ref(value) as *mut ());
        self
    }

    pub fn arg_mut<T>(&mut self, value: &mut T) -> &mut Self {
        self.arguments.push(ptr::from_mut(value).cast::<()>());
        self
    }

    pub fn as_mut_ptr(&mut self) -> *mut *mut () {
        self.arguments.as_mut_ptr()
    }
}

impl Default for KernelParameters {
    fn default() -> Self {
        Self::new()
    }
}