use crate::*;
use num_enum::TryFromPrimitive;

/// A [`Function`]-specific attribute type
#[derive(Debug, Copy, Clone, TryFromPrimitive)]
#[repr(u32)]
pub enum FunctionAttribute {
    /// The maximum number of threads per block, beyond which a launch of the function would fail.
    /// This number depends on both the function and the device on which the function is
    /// currently loaded.
    MaxThreadsPerBlock = 0,
    /// The size in bytes of statically-allocated shared memory required by this function.
    /// This does not include dynamically-allocated shared memory requested by the user at runtime.
    SharedSizeBytes = 1,
    /// The size in bytes of user-allocated constant memory required by this function.
    ConstSizeBytes = 2,
    /// The size in bytes of local memory used by each thread of this function.
    LocalSizeBytes = 3,
    /// The number of registers used by each thread of this function.
    NumRegs = 4,
    /// The PTX virtual architecture version for which the function was compiled. This value is
    /// the major PTX version * 10 + the minor PTX version, so a PTX version 1.3 function would
    /// return the value 13. Note that this may return the undefined value of 0 for cubins
    /// compiled prior to CUDA 3.0.
    PtxVersion = 5,
    /// The binary architecture version for which the function was compiled. This value is the
    /// major binary version * 10 + the minor binary version, so a binary version 1.3 function
    /// would return the value 13. Note that this will return a value of 10 for legacy cubins
    /// that do not have a properly-encoded binary architecture version.
    BinaryVersion = 6,
    /// Indicates whether the function has been compiled with the user-specified option
    /// `-Xptxas --dlcm=ca` set.
    CacheModeCa = 7,
    /// The maximum size in bytes of dynamically-allocated shared memory that can be used by this
    /// function. If the user-specified dynamic shared memory size is larger than this value, the
    /// launch will fail. See `cuFuncSetAttribute`.
    MaxDynamicSharedSizeBytes = 8,
    /// On devices where the L1 cache and shared memory use the same hardware resources, this
    /// sets the shared memory carveout preference, in percent of the total shared memory.
    /// Refer to `CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR`. This is only a hint,
    /// and the driver can choose a different ratio if required to execute the function.
    /// See `cuFuncSetAttribute`.
    PreferredSharedMemoryCarveout = 9,
}

/// A [`Function`] cache config
#[derive(Debug, Copy, Clone, TryFromPrimitive)]
#[repr(u32)]
pub enum FuncCache {
    /// No preference for shared memory or L1 (default)
    PreferNone = 0x00,
    /// Prefer larger shared memory and smaller L1 cache
    PreferShared = 0x01,
    /// Prefer larger L1 cache and smaller shared memory
    PreferL1 = 0x02,
    /// Prefer equal sized L1 cache and shared memory
    PreferEqual = 0x03,
}

/// A [`Function`] shared memory config
#[derive(Debug, Copy, Clone, TryFromPrimitive)]
#[repr(u32)]
pub enum FuncSharedConfig {
    /// Set default shared memory bank size
    DefaultBankSize = 0x00,
    /// Set shared memory bank width to four bytes
    FourByteBankSize = 0x01,
    /// Set shared memory bank width to eight bytes
    EightByteBankSize = 0x02,
}

/// Represents an individual callable kernel loaded from a [`Module`]
pub struct Function<'a, 'b> {
    pub(crate) module: &'b Module<'a>,
    pub(crate) inner: *mut sys::CUfunc_st,
}

impl<'a, 'b> Function<'a, 'b> {
    /// Returns a handle to the [`Module`] this function was loaded from.
    pub fn module(&self) -> &'b Module<'a> {
        self.module
    }

    /// Queries the value of a [`FunctionAttribute`] for this function.
    pub fn get_attribute(&self, attribute: FunctionAttribute) -> CudaResult<i32> {
        let mut out = 0i32;
        cuda_error(unsafe {
            sys::cuFuncGetAttribute(&mut out as *mut i32, attribute as u32, self.inner)
        })?;
        Ok(out)
    }

    /// Sets the value of a [`FunctionAttribute`] for this function.
    pub fn set_attribute(&mut self, attribute: FunctionAttribute, value: i32) -> CudaResult<()> {
        cuda_error(unsafe { sys::cuFuncSetAttribute(self.inner, attribute as u32, value) })
    }

    /// Sets the preferred cache configuration for a device function.
    pub fn set_cache_config(&mut self, func_cache: FuncCache) -> CudaResult<()> {
        cuda_error(unsafe { sys::cuFuncSetCacheConfig(self.inner, func_cache as u32) })
    }

    /// Sets the shared memory configuration for a device function.
    pub fn set_shared_mem_config(&mut self, config: FuncSharedConfig) -> CudaResult<()> {
        cuda_error(unsafe { sys::cuFuncSetSharedMemConfig(self.inner, config as u32) })
    }
}
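
// A minimal usage sketch: assuming a `Function` has already been loaded from a
// `Module` elsewhere, this shows how the accessors above can be combined to
// inspect and tune a kernel. The function name and the concrete attribute
// values below are illustrative assumptions, not an API defined by this crate.
#[allow(dead_code)]
fn tune_kernel_example(kernel: &mut Function<'_, '_>) -> CudaResult<()> {
    // Query per-thread register usage and the static shared-memory footprint.
    let regs = kernel.get_attribute(FunctionAttribute::NumRegs)?;
    let static_smem = kernel.get_attribute(FunctionAttribute::SharedSizeBytes)?;
    let _ = (regs, static_smem);

    // Allow up to 48 KiB of dynamically-allocated shared memory per block
    // (hypothetical value; must not exceed the device's limit).
    kernel.set_attribute(FunctionAttribute::MaxDynamicSharedSizeBytes, 48 * 1024)?;

    // Prefer a larger L1 cache over shared memory for this kernel, and use the
    // default shared-memory bank width.
    kernel.set_cache_config(FuncCache::PreferL1)?;
    kernel.set_shared_mem_config(FuncSharedConfig::DefaultBankSize)?;

    Ok(())
}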