cuda_oxide/
func.rs

1use crate::*;
2use num_enum::TryFromPrimitive;
3
4/// A [`Function`]-specific attribute type
5#[derive(Debug, Copy, Clone, TryFromPrimitive)]
6#[repr(u32)]
7pub enum FunctionAttribute {
8    /// The maximum number of threads per block, beyond which a launch of the function would fail. This number depends on both the function and the device on which the function is currently loaded.
9    MaxThreadsPerBlock = 0,
10    /// The size in bytes of statically-allocated shared memory required by this function. This does not include dynamically-allocated shared memory requested by the user at runtime.
11    SharedSizeBytes = 1,
12    /// The size in bytes of user-allocated constant memory required by this function.
13    ConstSizeBytes = 2,
14    /// The size in bytes of local memory used by each thread of this function.
15    LocalSizeBytes = 3,
16    /// The number of registers used by each thread of this function.
17    NumRegs = 4,
18    /// The PTX virtual architecture version for which the function was compiled. This value is the major PTX version * 10 + the minor PTX version, so a PTX version 1.3 function would return the value 13. Note that this may return the undefined value of 0 for cubins compiled prior to CUDA 3.0.
19    PtxVersion = 5,
20    /// The binary architecture version for which the function was compiled. This value is the major binary version * 10 + the minor binary version, so a binary version 1.3 function would return the value 13. Note that this will return a value of 10 for legacy cubins that do not have a properly-encoded binary architecture version.
21    BinaryVersion = 6,
22    /// The attribute to indicate whether the function has been compiled with user specified option "-Xptxas --dlcm=ca" set .
23    CacheModeCa = 7,
24    /// The maximum size in bytes of dynamically-allocated shared memory that can be used by this function. If the user-specified dynamic shared memory size is larger than this value, the launch will fail. See cuFuncSetAttribute
25    MaxDynamicSharedSizeBytes = 8,
26    /// On devices where the L1 cache and shared memory use the same hardware resources, this sets the shared memory carveout preference, in percent of the total shared memory. Refer to CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR. This is only a hint, and the driver can choose a different ratio if required to execute the function. See cuFuncSetAttribute
27    PreferredSharedMemoryCarveout = 9,
28}
29
30/// A [`Function`] cache config
31#[derive(Debug, Copy, Clone, TryFromPrimitive)]
32#[repr(u32)]
33pub enum FuncCache {
34    /// no preference for shared memory or L1 (default)
35    PreferNone = 0x00,
36    /// prefer larger shared memory and smaller L1 cache
37    PreferShared = 0x01,
38    /// prefer larger L1 cache and smaller shared memory
39    PreferL1 = 0x02,
40    /// prefer equal sized L1 cache and shared memory
41    PreferEqual = 0x03,
42}
43
44/// A [`Function`] shared memory config
45#[derive(Debug, Copy, Clone, TryFromPrimitive)]
46#[repr(u32)]
47pub enum FuncSharedConfig {
48    /// set default shared memory bank size
49    DefaultBankSize = 0x00,
50    /// set shared memory bank width to four bytes
51    FourByteBankSize = 0x01,
52    /// set shared memory bank width to eight bytes
53    EightByteBankSize = 0x02,
54}
55
56/// Represents an individual callable Kernel loaded from a [`Module`]
57pub struct Function<'a, 'b> {
58    pub(crate) module: &'b Module<'a>,
59    pub(crate) inner: *mut sys::CUfunc_st,
60}
61
62impl<'a, 'b> Function<'a, 'b> {
63    /// Returns a module handle.
64    pub fn module(&self) -> &'b Module<'a> {
65        self.module
66    }
67
68    /// Returns information about a function.
69    pub fn get_attribute(&self, attribute: FunctionAttribute) -> CudaResult<i32> {
70        let mut out = 0i32;
71        cuda_error(unsafe {
72            sys::cuFuncGetAttribute(&mut out as *mut i32, attribute as u32, self.inner)
73        })?;
74        Ok(out)
75    }
76
77    /// Sets information about a function.
78    pub fn set_attribute(&mut self, attribute: FunctionAttribute, value: i32) -> CudaResult<()> {
79        cuda_error(unsafe { sys::cuFuncSetAttribute(self.inner, attribute as u32, value) })
80    }
81
82    /// Sets the preferred cache configuration for a device function.
83    pub fn set_cache_config(&mut self, func_cache: FuncCache) -> CudaResult<()> {
84        cuda_error(unsafe { sys::cuFuncSetCacheConfig(self.inner, func_cache as u32) })
85    }
86
87    /// Sets the shared memory configuration for a device function.
88    pub fn set_shared_mem_config(&mut self, config: FuncSharedConfig) -> CudaResult<()> {
89        cuda_error(unsafe { sys::cuFuncSetSharedMemConfig(self.inner, config as u32) })
90    }
91}