use std::{
ffi::{CStr, CString},
fmt::{self, Display, Formatter},
mem::{self, MaybeUninit},
};
use num_enum::{IntoPrimitive, TryFromPrimitive};
use singe_core::impl_enum_conversion;
use singe_cuda_sys::runtime;
use crate::{
context::ContextFlags,
error::{Error, Result},
try_cuda,
types::FunctionCache,
};
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, TryFromPrimitive, IntoPrimitive)]
#[repr(u32)]
pub enum Limit {
StackSize = runtime::cudaLimit::cudaLimitStackSize as _,
PrintfFifoSize = runtime::cudaLimit::cudaLimitPrintfFifoSize as _,
MallocHeapSize = runtime::cudaLimit::cudaLimitMallocHeapSize as _,
DevRuntimeSyncDepth = runtime::cudaLimit::cudaLimitDevRuntimeSyncDepth as _,
DevRuntimePendingLaunchCount = runtime::cudaLimit::cudaLimitDevRuntimePendingLaunchCount as _,
MaxL2FetchGranularity = runtime::cudaLimit::cudaLimitMaxL2FetchGranularity as _,
PersistingL2CacheSize = runtime::cudaLimit::cudaLimitPersistingL2CacheSize as _,
}
impl_enum_conversion!(runtime::cudaLimit, Limit);
impl Display for Limit {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
match self {
Self::StackSize => write!(f, "cudaLimitStackSize"),
Self::PrintfFifoSize => write!(f, "cudaLimitPrintfFifoSize"),
Self::MallocHeapSize => write!(f, "cudaLimitMallocHeapSize"),
Self::DevRuntimeSyncDepth => write!(f, "cudaLimitDevRuntimeSyncDepth"),
Self::DevRuntimePendingLaunchCount => {
write!(f, "cudaLimitDevRuntimePendingLaunchCount")
}
Self::MaxL2FetchGranularity => write!(f, "cudaLimitMaxL2FetchGranularity"),
Self::PersistingL2CacheSize => write!(f, "cudaLimitPersistingL2CacheSize"),
}
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, TryFromPrimitive, IntoPrimitive)]
#[repr(u32)]
pub enum ComputeMode {
Default = runtime::cudaComputeMode::cudaComputeModeDefault as _,
Exclusive = runtime::cudaComputeMode::cudaComputeModeExclusive as _,
Prohibited = runtime::cudaComputeMode::cudaComputeModeProhibited as _,
ExclusiveProcess = runtime::cudaComputeMode::cudaComputeModeExclusiveProcess as _,
}
impl_enum_conversion!(runtime::cudaComputeMode, ComputeMode);
bitflags::bitflags! {
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct PeerAccessFlags: u32 {
const DEFAULT = runtime::cudaPeerAccessDefault;
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, TryFromPrimitive, IntoPrimitive)]
#[repr(u32)]
pub enum PeerToPeerAttribute {
PerformanceRank = runtime::cudaDeviceP2PAttr::CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK as _,
AccessSupported = runtime::cudaDeviceP2PAttr::CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED as _,
NativeAtomicSupported =
runtime::cudaDeviceP2PAttr::CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED as _,
CudaArrayAccessSupported =
runtime::cudaDeviceP2PAttr::CU_DEVICE_P2P_ATTRIBUTE_ACCESS_ACCESS_SUPPORTED as _,
}
impl_enum_conversion!(runtime::cudaDeviceP2PAttr, PeerToPeerAttribute);
impl Display for PeerToPeerAttribute {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
match self {
Self::PerformanceRank => write!(f, "cudaDevP2PAttrPerformanceRank"),
Self::AccessSupported => write!(f, "cudaDevP2PAttrAccessSupported"),
Self::NativeAtomicSupported => {
write!(f, "cudaDevP2PAttrNativeAtomicSupported")
}
Self::CudaArrayAccessSupported => {
write!(f, "cudaDevP2PAttrCudaArrayAccessSupported")
}
}
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct StreamPriorityRange {
pub least: i32,
pub greatest: i32,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct Uuid {
pub bytes: [u8; 16],
}
impl From<runtime::cudaUUID_t> for Uuid {
fn from(value: runtime::cudaUUID_t) -> Self {
Self {
bytes: value.bytes.map(|byte| byte as u8),
}
}
}
#[derive(Debug, Clone)]
pub struct DeviceProperties {
pub name: String,
pub uuid: Uuid,
pub luid: [u8; 8],
pub luid_device_node_mask: u32,
pub total_global_mem: usize,
pub shared_mem_per_block: usize,
pub regs_per_block: i32,
pub warp_size: i32,
pub mem_pitch: usize,
pub max_threads_per_block: i32,
pub max_threads_dim: [i32; 3],
pub max_grid_size: [i32; 3],
pub total_const_mem: usize,
pub major: i32,
pub minor: i32,
pub texture_alignment: usize,
pub texture_pitch_alignment: usize,
pub multi_processor_count: i32,
pub integrated: bool,
pub can_map_host_memory: bool,
pub max_texture1d: i32,
pub max_texture1d_mipmap: i32,
pub max_texture2d: [i32; 2],
pub max_texture2d_mipmap: [i32; 2],
pub max_texture2d_linear: [i32; 3],
pub max_texture2d_gather: [i32; 2],
pub max_texture3d: [i32; 3],
pub max_texture3d_alt: [i32; 3],
pub max_texture_cubemap: i32,
pub max_texture1d_layered: [i32; 2],
pub max_texture2d_layered: [i32; 3],
pub max_texture_cubemap_layered: [i32; 2],
pub max_surface1d: i32,
pub max_surface2d: [i32; 2],
pub max_surface3d: [i32; 3],
pub max_surface1d_layered: [i32; 2],
pub max_surface2d_layered: [i32; 3],
pub max_surface_cubemap: i32,
pub max_surface_cubemap_layered: [i32; 2],
pub surface_alignment: usize,
pub concurrent_kernels: bool,
pub ecc_enabled: bool,
pub pci_bus_id: i32,
pub pci_device_id: i32,
pub pci_domain_id: i32,
pub tcc_driver: bool,
pub async_engine_count: i32,
pub unified_addressing: bool,
pub memory_bus_width: i32,
pub l2_cache_size: i32,
pub persisting_l2_cache_max_size: i32,
pub max_threads_per_multi_processor: i32,
pub stream_priorities_supported: bool,
pub global_l1_cache_supported: bool,
pub local_l1_cache_supported: bool,
pub shared_mem_per_multiprocessor: usize,
pub regs_per_multiprocessor: i32,
pub managed_memory: bool,
pub is_multi_gpu_board: bool,
pub multi_gpu_board_group_id: i32,
pub host_native_atomic_supported: bool,
pub pageable_memory_access: bool,
pub concurrent_managed_access: bool,
pub compute_preemption_supported: bool,
pub can_use_host_pointer_for_registered_mem: bool,
pub cooperative_launch: bool,
pub shared_mem_per_block_optin: usize,
pub pageable_memory_access_uses_host_page_tables: bool,
pub direct_managed_mem_access_from_host: bool,
pub max_blocks_per_multi_processor: i32,
pub access_policy_max_window_size: i32,
pub reserved_shared_mem_per_block: usize,
pub host_register_supported: bool,
pub sparse_cuda_array_supported: bool,
pub host_register_read_only_supported: bool,
pub timeline_semaphore_interop_supported: bool,
pub memory_pools_supported: bool,
pub gpu_direct_rdma_supported: bool,
pub gpu_direct_rdma_flush_writes_options: u32,
pub gpu_direct_rdma_writes_ordering: i32,
pub memory_pool_supported_handle_types: u32,
pub deferred_mapping_cuda_array_supported: bool,
pub ipc_event_supported: bool,
pub cluster_launch: bool,
pub unified_function_pointers: bool,
}
impl TryFrom<runtime::cudaDeviceProp> for DeviceProperties {
type Error = Error;
fn try_from(value: runtime::cudaDeviceProp) -> Result<Self> {
let end = value
.name
.iter()
.position(|&c| c == 0)
.unwrap_or(value.name.len());
let name_bytes: Vec<u8> = value.name[..end].iter().map(|&byte| byte as u8).collect();
let name = String::from_utf8_lossy(&name_bytes).into_owned();
let prop = Self {
name,
uuid: value.uuid.into(),
luid: value.luid.map(|byte| byte as u8),
luid_device_node_mask: value.luidDeviceNodeMask,
total_global_mem: value.totalGlobalMem as usize,
shared_mem_per_block: value.sharedMemPerBlock as usize,
regs_per_block: value.regsPerBlock,
warp_size: value.warpSize,
mem_pitch: value.memPitch as usize,
max_threads_per_block: value.maxThreadsPerBlock,
max_threads_dim: value.maxThreadsDim,
max_grid_size: value.maxGridSize,
total_const_mem: value.totalConstMem as usize,
major: value.major,
minor: value.minor,
texture_alignment: value.textureAlignment as usize,
texture_pitch_alignment: value.texturePitchAlignment as usize,
multi_processor_count: value.multiProcessorCount,
integrated: value.integrated != 0,
can_map_host_memory: value.canMapHostMemory != 0,
max_texture1d: value.maxTexture1D,
max_texture1d_mipmap: value.maxTexture1DMipmap,
max_texture2d: value.maxTexture2D,
max_texture2d_mipmap: value.maxTexture2DMipmap,
max_texture2d_linear: value.maxTexture2DLinear,
max_texture2d_gather: value.maxTexture2DGather,
max_texture3d: value.maxTexture3D,
max_texture3d_alt: value.maxTexture3DAlt,
max_texture_cubemap: value.maxTextureCubemap,
max_texture1d_layered: value.maxTexture1DLayered,
max_texture2d_layered: value.maxTexture2DLayered,
max_texture_cubemap_layered: value.maxTextureCubemapLayered,
max_surface1d: value.maxSurface1D,
max_surface2d: value.maxSurface2D,
max_surface3d: value.maxSurface3D,
max_surface1d_layered: value.maxSurface1DLayered,
max_surface2d_layered: value.maxSurface2DLayered,
max_surface_cubemap: value.maxSurfaceCubemap,
max_surface_cubemap_layered: value.maxSurfaceCubemapLayered,
surface_alignment: value.surfaceAlignment as usize,
concurrent_kernels: value.concurrentKernels != 0,
ecc_enabled: value.ECCEnabled != 0,
pci_bus_id: value.pciBusID,
pci_device_id: value.pciDeviceID,
pci_domain_id: value.pciDomainID,
tcc_driver: value.tccDriver != 0,
async_engine_count: value.asyncEngineCount,
unified_addressing: value.unifiedAddressing != 0,
memory_bus_width: value.memoryBusWidth,
l2_cache_size: value.l2CacheSize,
persisting_l2_cache_max_size: value.persistingL2CacheMaxSize,
max_threads_per_multi_processor: value.maxThreadsPerMultiProcessor,
stream_priorities_supported: value.streamPrioritiesSupported != 0,
global_l1_cache_supported: value.globalL1CacheSupported != 0,
local_l1_cache_supported: value.localL1CacheSupported != 0,
shared_mem_per_multiprocessor: value.sharedMemPerMultiprocessor as usize,
regs_per_multiprocessor: value.regsPerMultiprocessor,
managed_memory: value.managedMemory != 0,
is_multi_gpu_board: value.isMultiGpuBoard != 0,
multi_gpu_board_group_id: value.multiGpuBoardGroupID,
host_native_atomic_supported: value.hostNativeAtomicSupported != 0,
pageable_memory_access: value.pageableMemoryAccess != 0,
concurrent_managed_access: value.concurrentManagedAccess != 0,
compute_preemption_supported: value.computePreemptionSupported != 0,
can_use_host_pointer_for_registered_mem: value.canUseHostPointerForRegisteredMem != 0,
cooperative_launch: value.cooperativeLaunch != 0,
shared_mem_per_block_optin: value.sharedMemPerBlockOptin as usize,
pageable_memory_access_uses_host_page_tables: value
.pageableMemoryAccessUsesHostPageTables
!= 0,
direct_managed_mem_access_from_host: value.directManagedMemAccessFromHost != 0,
max_blocks_per_multi_processor: value.maxBlocksPerMultiProcessor,
access_policy_max_window_size: value.accessPolicyMaxWindowSize,
reserved_shared_mem_per_block: value.reservedSharedMemPerBlock as usize,
host_register_supported: value.hostRegisterSupported != 0,
sparse_cuda_array_supported: value.sparseCudaArraySupported != 0,
host_register_read_only_supported: value.hostRegisterReadOnlySupported != 0,
timeline_semaphore_interop_supported: value.timelineSemaphoreInteropSupported != 0,
memory_pools_supported: value.memoryPoolsSupported != 0,
gpu_direct_rdma_supported: value.gpuDirectRDMASupported != 0,
gpu_direct_rdma_flush_writes_options: value.gpuDirectRDMAFlushWritesOptions,
gpu_direct_rdma_writes_ordering: value.gpuDirectRDMAWritesOrdering,
memory_pool_supported_handle_types: value.memoryPoolSupportedHandleTypes,
deferred_mapping_cuda_array_supported: value.deferredMappingCudaArraySupported != 0,
ipc_event_supported: value.ipcEventSupported != 0,
cluster_launch: value.clusterLaunch != 0,
unified_function_pointers: value.unifiedFunctionPointers != 0,
};
Ok(prop)
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct Device(DeviceId);
pub type DeviceId = i32;
impl Device {
pub const fn new(id: DeviceId) -> Self {
Self(id)
}
pub fn count() -> Result<i32> {
let mut count: i32 = 0;
unsafe {
try_cuda!(runtime::cudaGetDeviceCount(&raw mut count))?;
}
Ok(count)
}
pub fn current() -> Result<Self> {
let mut device_id: i32 = 0;
unsafe {
try_cuda!(runtime::cudaGetDevice(&raw mut device_id))?;
}
Ok(Self(device_id))
}
pub fn synchronize() -> Result<()> {
unsafe {
try_cuda!(runtime::cudaDeviceSynchronize())?;
}
Ok(())
}
pub fn reset() -> Result<()> {
unsafe {
try_cuda!(runtime::cudaDeviceReset())?;
}
Ok(())
}
pub fn limit(limit: Limit) -> Result<usize> {
let mut value = 0;
unsafe {
try_cuda!(runtime::cudaDeviceGetLimit(&raw mut value, limit.into(),))?;
}
Ok(value as _)
}
pub fn set_limit(limit: Limit, value: usize) -> Result<()> {
unsafe {
try_cuda!(runtime::cudaDeviceSetLimit(limit.into(), value as _))?;
}
Ok(())
}
pub fn set_flags(flags: ContextFlags) -> Result<()> {
unsafe { try_cuda!(runtime::cudaSetDeviceFlags(flags.bits())) }
}
pub fn flags() -> Result<ContextFlags> {
let mut flags_raw: u32 = 0;
unsafe {
try_cuda!(runtime::cudaGetDeviceFlags(&raw mut flags_raw))?;
}
Ok(ContextFlags::from_bits_retain(flags_raw))
}
pub fn choose(prop: &DeviceProperties) -> Result<Self> {
let mut ffi_prop: runtime::cudaDeviceProp = unsafe { mem::zeroed() };
ffi_prop.major = prop.major;
ffi_prop.minor = prop.minor;
ffi_prop.managedMemory = i32::from(prop.managed_memory);
let mut device: i32 = -1;
unsafe {
try_cuda!(runtime::cudaChooseDevice(
(&raw mut device).cast(),
&raw const ffi_prop
))?;
}
if device == -1 {
Err(Error::DeviceNotFound)
} else {
Ok(Self(device))
}
}
pub fn by_pci_bus_id(pci_bus_id: &str) -> Result<Self> {
let c_pci_bus_id = CString::new(pci_bus_id)?;
let mut device: i32 = -1;
unsafe {
try_cuda!(runtime::cudaDeviceGetByPCIBusId(
(&raw mut device).cast(),
c_pci_bus_id.as_ptr(),
))?;
}
if device == -1 {
Err(Error::DeviceNotFound)
} else {
Ok(Self(device))
}
}
pub fn set_current(self) -> Result<()> {
unsafe {
try_cuda!(runtime::cudaSetDevice(self.0))?;
}
Ok(())
}
pub fn properties(self) -> Result<DeviceProperties> {
unsafe {
let mut prop = MaybeUninit::<runtime::cudaDeviceProp>::uninit();
try_cuda!(runtime::cudaGetDeviceProperties(prop.as_mut_ptr(), self.0))?;
prop.assume_init().try_into()
}
}
pub fn pci_bus_id(self) -> Result<String> {
const LEN: usize = 16; let mut pci_bus_id_buf = [0i8; LEN];
unsafe {
try_cuda!(runtime::cudaDeviceGetPCIBusId(
pci_bus_id_buf.as_mut_ptr().cast(),
LEN as _,
self.0,
))?;
let c_str = CStr::from_ptr(pci_bus_id_buf.as_ptr().cast());
Ok(c_str.to_string_lossy().into_owned())
}
}
pub fn enable_peer_access(self, flags: PeerAccessFlags) -> Result<()> {
if flags != PeerAccessFlags::DEFAULT {
return Err(Error::InvalidValue);
}
unsafe { try_cuda!(runtime::cudaDeviceEnablePeerAccess(self.0, flags.bits(),)) }
}
pub fn disable_peer_access(self) -> Result<()> {
unsafe { try_cuda!(runtime::cudaDeviceDisablePeerAccess(self.0)) }
}
pub fn can_access_peer(self, other: Self) -> Result<bool> {
let mut can_access_peer: i32 = 0;
unsafe {
try_cuda!(runtime::cudaDeviceCanAccessPeer(
(&raw mut can_access_peer).cast(),
self.0,
other.0,
))?;
}
Ok(can_access_peer != 0)
}
pub fn p2p_attribute(self, attr: PeerToPeerAttribute, other: Self) -> Result<i32> {
let mut value: i32 = 0;
unsafe {
try_cuda!(runtime::cudaDeviceGetP2PAttribute(
(&raw mut value).cast(),
attr.into(),
self.0,
other.0,
))?;
}
Ok(value)
}
pub fn cache_config() -> Result<FunctionCache> {
let mut config = runtime::cudaFuncCache::CU_FUNC_CACHE_PREFER_NONE;
unsafe {
try_cuda!(runtime::cudaDeviceGetCacheConfig(&raw mut config))?;
}
Ok(config.into())
}
pub fn set_cache_config(config: FunctionCache) -> Result<()> {
unsafe {
try_cuda!(runtime::cudaDeviceSetCacheConfig(config.into()))?;
}
Ok(())
}
pub fn stream_priority_range() -> Result<StreamPriorityRange> {
let mut least = 0;
let mut greatest = 0;
unsafe {
try_cuda!(runtime::cudaDeviceGetStreamPriorityRange(
&raw mut least,
&raw mut greatest,
))?;
}
Ok(StreamPriorityRange { least, greatest })
}
pub const fn id(self) -> DeviceId {
self.0
}
}
#[cfg(all(test, feature = "testing"))]
mod tests {
use super::*;
#[test]
fn it_works() {
match Device::count() {
Ok(count) => {
println!("Found {} CUDA devices.", count);
if count > 0 {
match Device::new(0).properties() {
Ok(props) => println!("Device 0: {}", props.name),
Err(e) => eprintln!("error getting properties for device 0: {:?}", e),
}
}
}
Err(e) => eprintln!("error getting device count: {:?}", e),
}
}
}