cuda_oxide/func.rs
1use crate::*;
2use num_enum::TryFromPrimitive;
3
4/// A [`Function`]-specific attribute type
5#[derive(Debug, Copy, Clone, TryFromPrimitive)]
6#[repr(u32)]
7pub enum FunctionAttribute {
8 /// The maximum number of threads per block, beyond which a launch of the function would fail. This number depends on both the function and the device on which the function is currently loaded.
9 MaxThreadsPerBlock = 0,
10 /// The size in bytes of statically-allocated shared memory required by this function. This does not include dynamically-allocated shared memory requested by the user at runtime.
11 SharedSizeBytes = 1,
12 /// The size in bytes of user-allocated constant memory required by this function.
13 ConstSizeBytes = 2,
14 /// The size in bytes of local memory used by each thread of this function.
15 LocalSizeBytes = 3,
16 /// The number of registers used by each thread of this function.
17 NumRegs = 4,
18 /// The PTX virtual architecture version for which the function was compiled. This value is the major PTX version * 10 + the minor PTX version, so a PTX version 1.3 function would return the value 13. Note that this may return the undefined value of 0 for cubins compiled prior to CUDA 3.0.
19 PtxVersion = 5,
20 /// The binary architecture version for which the function was compiled. This value is the major binary version * 10 + the minor binary version, so a binary version 1.3 function would return the value 13. Note that this will return a value of 10 for legacy cubins that do not have a properly-encoded binary architecture version.
21 BinaryVersion = 6,
22 /// The attribute to indicate whether the function has been compiled with user specified option "-Xptxas --dlcm=ca" set .
23 CacheModeCa = 7,
24 /// The maximum size in bytes of dynamically-allocated shared memory that can be used by this function. If the user-specified dynamic shared memory size is larger than this value, the launch will fail. See cuFuncSetAttribute
25 MaxDynamicSharedSizeBytes = 8,
26 /// On devices where the L1 cache and shared memory use the same hardware resources, this sets the shared memory carveout preference, in percent of the total shared memory. Refer to CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR. This is only a hint, and the driver can choose a different ratio if required to execute the function. See cuFuncSetAttribute
27 PreferredSharedMemoryCarveout = 9,
28}
29
30/// A [`Function`] cache config
31#[derive(Debug, Copy, Clone, TryFromPrimitive)]
32#[repr(u32)]
33pub enum FuncCache {
34 /// no preference for shared memory or L1 (default)
35 PreferNone = 0x00,
36 /// prefer larger shared memory and smaller L1 cache
37 PreferShared = 0x01,
38 /// prefer larger L1 cache and smaller shared memory
39 PreferL1 = 0x02,
40 /// prefer equal sized L1 cache and shared memory
41 PreferEqual = 0x03,
42}
43
44/// A [`Function`] shared memory config
45#[derive(Debug, Copy, Clone, TryFromPrimitive)]
46#[repr(u32)]
47pub enum FuncSharedConfig {
48 /// set default shared memory bank size
49 DefaultBankSize = 0x00,
50 /// set shared memory bank width to four bytes
51 FourByteBankSize = 0x01,
52 /// set shared memory bank width to eight bytes
53 EightByteBankSize = 0x02,
54}
55
56/// Represents an individual callable Kernel loaded from a [`Module`]
57pub struct Function<'a, 'b> {
58 pub(crate) module: &'b Module<'a>,
59 pub(crate) inner: *mut sys::CUfunc_st,
60}
61
62impl<'a, 'b> Function<'a, 'b> {
63 /// Returns a module handle.
64 pub fn module(&self) -> &'b Module<'a> {
65 self.module
66 }
67
68 /// Returns information about a function.
69 pub fn get_attribute(&self, attribute: FunctionAttribute) -> CudaResult<i32> {
70 let mut out = 0i32;
71 cuda_error(unsafe {
72 sys::cuFuncGetAttribute(&mut out as *mut i32, attribute as u32, self.inner)
73 })?;
74 Ok(out)
75 }
76
77 /// Sets information about a function.
78 pub fn set_attribute(&mut self, attribute: FunctionAttribute, value: i32) -> CudaResult<()> {
79 cuda_error(unsafe { sys::cuFuncSetAttribute(self.inner, attribute as u32, value) })
80 }
81
82 /// Sets the preferred cache configuration for a device function.
83 pub fn set_cache_config(&mut self, func_cache: FuncCache) -> CudaResult<()> {
84 cuda_error(unsafe { sys::cuFuncSetCacheConfig(self.inner, func_cache as u32) })
85 }
86
87 /// Sets the shared memory configuration for a device function.
88 pub fn set_shared_mem_config(&mut self, config: FuncSharedConfig) -> CudaResult<()> {
89 cuda_error(unsafe { sys::cuFuncSetSharedMemConfig(self.inner, config as u32) })
90 }
91}