use std::ffi::{c_char, c_int};
use crate::error::CudaResult;
use crate::ffi::{CUdevice, CUdevice_attribute};
use crate::loader::try_driver;
/// Lightweight handle to a single CUDA device.
///
/// Wraps the raw driver `CUdevice` together with the ordinal it was looked
/// up with; `Copy` because it is just two integers.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct Device {
    /// Raw driver handle returned by `cuDeviceGet`.
    raw: CUdevice,
    /// Zero-based ordinal this handle was obtained with.
    ordinal: i32,
}
impl Device {
pub fn get(ordinal: i32) -> CudaResult<Self> {
let driver = try_driver()?;
let mut raw: CUdevice = 0;
crate::error::check(unsafe { (driver.cu_device_get)(&mut raw, ordinal) })?;
Ok(Self { raw, ordinal })
}
pub fn count() -> CudaResult<i32> {
let driver = try_driver()?;
let mut count: std::ffi::c_int = 0;
crate::error::check(unsafe { (driver.cu_device_get_count)(&mut count) })?;
Ok(count)
}
pub fn name(&self) -> CudaResult<String> {
let driver = try_driver()?;
let mut buf = [0u8; 256];
crate::error::check(unsafe {
(driver.cu_device_get_name)(buf.as_mut_ptr() as *mut c_char, 256, self.raw)
})?;
let len = buf.iter().position(|&b| b == 0).unwrap_or(buf.len());
Ok(String::from_utf8_lossy(&buf[..len]).into_owned())
}
pub fn total_memory(&self) -> CudaResult<usize> {
let driver = try_driver()?;
let mut bytes: usize = 0;
crate::error::check(unsafe { (driver.cu_device_total_mem_v2)(&mut bytes, self.raw) })?;
Ok(bytes)
}
pub fn attribute(&self, attr: CUdevice_attribute) -> CudaResult<i32> {
let driver = try_driver()?;
let mut value: std::ffi::c_int = 0;
crate::error::check(unsafe {
(driver.cu_device_get_attribute)(&mut value, attr, self.raw)
})?;
Ok(value)
}
pub fn compute_capability(&self) -> CudaResult<(i32, i32)> {
let major = self.attribute(CUdevice_attribute::ComputeCapabilityMajor)?;
let minor = self.attribute(CUdevice_attribute::ComputeCapabilityMinor)?;
Ok((major, minor))
}
pub fn max_threads_per_block(&self) -> CudaResult<i32> {
self.attribute(CUdevice_attribute::MaxThreadsPerBlock)
}
pub fn max_block_dim(&self) -> CudaResult<(i32, i32, i32)> {
Ok((
self.attribute(CUdevice_attribute::MaxBlockDimX)?,
self.attribute(CUdevice_attribute::MaxBlockDimY)?,
self.attribute(CUdevice_attribute::MaxBlockDimZ)?,
))
}
pub fn max_grid_dim(&self) -> CudaResult<(i32, i32, i32)> {
Ok((
self.attribute(CUdevice_attribute::MaxGridDimX)?,
self.attribute(CUdevice_attribute::MaxGridDimY)?,
self.attribute(CUdevice_attribute::MaxGridDimZ)?,
))
}
pub fn max_threads_per_multiprocessor(&self) -> CudaResult<i32> {
self.attribute(CUdevice_attribute::MaxThreadsPerMultiprocessor)
}
pub fn max_blocks_per_multiprocessor(&self) -> CudaResult<i32> {
self.attribute(CUdevice_attribute::MaxBlocksPerMultiprocessor)
}
pub fn multiprocessor_count(&self) -> CudaResult<i32> {
self.attribute(CUdevice_attribute::MultiprocessorCount)
}
pub fn warp_size(&self) -> CudaResult<i32> {
self.attribute(CUdevice_attribute::WarpSize)
}
pub fn max_shared_memory_per_block(&self) -> CudaResult<i32> {
self.attribute(CUdevice_attribute::MaxSharedMemoryPerBlock)
}
pub fn max_shared_memory_per_multiprocessor(&self) -> CudaResult<i32> {
self.attribute(CUdevice_attribute::MaxSharedMemoryPerMultiprocessor)
}
pub fn max_shared_memory_per_block_optin(&self) -> CudaResult<i32> {
self.attribute(CUdevice_attribute::MaxSharedMemoryPerBlockOptin)
}
pub fn max_registers_per_block(&self) -> CudaResult<i32> {
self.attribute(CUdevice_attribute::MaxRegistersPerBlock)
}
pub fn max_registers_per_multiprocessor(&self) -> CudaResult<i32> {
self.attribute(CUdevice_attribute::MaxRegistersPerMultiprocessor)
}
pub fn l2_cache_size(&self) -> CudaResult<i32> {
self.attribute(CUdevice_attribute::L2CacheSize)
}
pub fn total_constant_memory(&self) -> CudaResult<i32> {
self.attribute(CUdevice_attribute::TotalConstantMemory)
}
pub fn clock_rate_khz(&self) -> CudaResult<i32> {
self.attribute(CUdevice_attribute::ClockRate)
}
pub fn memory_clock_rate_khz(&self) -> CudaResult<i32> {
self.attribute(CUdevice_attribute::MemoryClockRate)
}
pub fn memory_bus_width(&self) -> CudaResult<i32> {
self.attribute(CUdevice_attribute::GlobalMemoryBusWidth)
}
pub fn pci_bus_id(&self) -> CudaResult<i32> {
self.attribute(CUdevice_attribute::PciBusId)
}
pub fn pci_device_id(&self) -> CudaResult<i32> {
self.attribute(CUdevice_attribute::PciDeviceId)
}
pub fn pci_domain_id(&self) -> CudaResult<i32> {
self.attribute(CUdevice_attribute::PciDomainId)
}
pub fn supports_managed_memory(&self) -> CudaResult<bool> {
Ok(self.attribute(CUdevice_attribute::ManagedMemory)? != 0)
}
pub fn supports_concurrent_managed_access(&self) -> CudaResult<bool> {
Ok(self.attribute(CUdevice_attribute::ConcurrentManagedAccess)? != 0)
}
pub fn supports_concurrent_kernels(&self) -> CudaResult<bool> {
Ok(self.attribute(CUdevice_attribute::ConcurrentKernels)? != 0)
}
pub fn supports_cooperative_launch(&self) -> CudaResult<bool> {
Ok(self.attribute(CUdevice_attribute::CooperativeLaunch)? != 0)
}
pub fn ecc_enabled(&self) -> CudaResult<bool> {
Ok(self.attribute(CUdevice_attribute::EccEnabled)? != 0)
}
pub fn is_integrated(&self) -> CudaResult<bool> {
Ok(self.attribute(CUdevice_attribute::Integrated)? != 0)
}
pub fn can_map_host_memory(&self) -> CudaResult<bool> {
Ok(self.attribute(CUdevice_attribute::CanMapHostMemory)? != 0)
}
pub fn supports_unified_addressing(&self) -> CudaResult<bool> {
Ok(self.attribute(CUdevice_attribute::UnifiedAddressing)? != 0)
}
pub fn supports_stream_priorities(&self) -> CudaResult<bool> {
Ok(self.attribute(CUdevice_attribute::StreamPrioritiesSupported)? != 0)
}
pub fn supports_compute_preemption(&self) -> CudaResult<bool> {
Ok(self.attribute(CUdevice_attribute::ComputePreemptionSupported)? != 0)
}
pub fn async_engine_count(&self) -> CudaResult<i32> {
self.attribute(CUdevice_attribute::AsyncEngineCount)
}
pub fn is_multi_gpu_board(&self) -> CudaResult<bool> {
Ok(self.attribute(CUdevice_attribute::IsMultiGpuBoard)? != 0)
}
pub fn has_kernel_exec_timeout(&self) -> CudaResult<bool> {
Ok(self.attribute(CUdevice_attribute::KernelExecTimeout)? != 0)
}
pub fn compute_mode(&self) -> CudaResult<i32> {
self.attribute(CUdevice_attribute::ComputeMode)
}
pub fn tcc_driver(&self) -> CudaResult<bool> {
Ok(self.attribute(CUdevice_attribute::TccDriver)? != 0)
}
pub fn multi_gpu_board_group_id(&self) -> CudaResult<i32> {
self.attribute(CUdevice_attribute::MultiGpuBoardGroupId)
}
pub fn max_persisting_l2_cache_size(&self) -> CudaResult<i32> {
self.attribute(CUdevice_attribute::MaxPersistingL2CacheSize)
}
pub fn supports_generic_compression(&self) -> CudaResult<bool> {
Ok(self.attribute(CUdevice_attribute::GenericCompressionSupported)? != 0)
}
pub fn supports_pageable_memory_access(&self) -> CudaResult<bool> {
Ok(self.attribute(CUdevice_attribute::PageableMemoryAccess)? != 0)
}
pub fn pageable_memory_uses_host_page_tables(&self) -> CudaResult<bool> {
Ok(self.attribute(CUdevice_attribute::PageableMemoryAccessUsesHostPageTables)? != 0)
}
pub fn supports_direct_managed_mem_from_host(&self) -> CudaResult<bool> {
Ok(self.attribute(CUdevice_attribute::DirectManagedMemAccessFromHost)? != 0)
}
pub fn memory_pool_supported_handle_types(&self) -> CudaResult<i32> {
self.attribute(CUdevice_attribute::MemoryPoolSupportedHandleTypes)
}
pub fn supports_host_native_atomics(&self) -> CudaResult<bool> {
Ok(self.attribute(CUdevice_attribute::HostNativeAtomicSupported)? != 0)
}
pub fn single_to_double_perf_ratio(&self) -> CudaResult<i32> {
self.attribute(CUdevice_attribute::SingleToDoublePrecisionPerfRatio)
}
pub fn supports_cooperative_multi_device_launch(&self) -> CudaResult<bool> {
Ok(self.attribute(CUdevice_attribute::CooperativeMultiDeviceLaunch)? != 0)
}
pub fn supports_flush_remote_writes(&self) -> CudaResult<bool> {
Ok(self.attribute(CUdevice_attribute::CanFlushRemoteWrites)? != 0)
}
pub fn supports_host_register(&self) -> CudaResult<bool> {
Ok(self.attribute(CUdevice_attribute::HostRegisterSupported)? != 0)
}
pub fn can_use_host_pointer_for_registered_mem(&self) -> CudaResult<bool> {
Ok(self.attribute(CUdevice_attribute::CanUseHostPointerForRegisteredMem)? != 0)
}
pub fn supports_gpu_direct_rdma(&self) -> CudaResult<bool> {
Ok(self.attribute(CUdevice_attribute::GpuDirectRdmaSupported)? != 0)
}
pub fn supports_tensor_map_access(&self) -> CudaResult<bool> {
Ok(self.attribute(CUdevice_attribute::TensorMapAccessSupported)? != 0)
}
pub fn supports_multicast(&self) -> CudaResult<bool> {
Ok(self.attribute(CUdevice_attribute::MulticastSupported)? != 0)
}
pub fn mps_enabled(&self) -> CudaResult<bool> {
Ok(self.attribute(CUdevice_attribute::MpsEnabled)? != 0)
}
pub fn max_texture_1d_width(&self) -> CudaResult<i32> {
self.attribute(CUdevice_attribute::MaxTexture1DWidth)
}
pub fn max_texture_2d_dims(&self) -> CudaResult<(i32, i32)> {
Ok((
self.attribute(CUdevice_attribute::MaxTexture2DWidth)?,
self.attribute(CUdevice_attribute::MaxTexture2DHeight)?,
))
}
pub fn max_texture_3d_dims(&self) -> CudaResult<(i32, i32, i32)> {
Ok((
self.attribute(CUdevice_attribute::MaxTexture3DWidth)?,
self.attribute(CUdevice_attribute::MaxTexture3DHeight)?,
self.attribute(CUdevice_attribute::MaxTexture3DDepth)?,
))
}
pub fn gpu_overlap(&self) -> CudaResult<bool> {
Ok(self.attribute(CUdevice_attribute::GpuOverlap)? != 0)
}
pub fn max_pitch(&self) -> CudaResult<i32> {
self.attribute(CUdevice_attribute::MaxPitch)
}
pub fn texture_alignment(&self) -> CudaResult<i32> {
self.attribute(CUdevice_attribute::TextureAlignment)
}
pub fn surface_alignment(&self) -> CudaResult<i32> {
self.attribute(CUdevice_attribute::SurfaceAlignment)
}
pub fn supports_deferred_mapping(&self) -> CudaResult<bool> {
Ok(self.attribute(CUdevice_attribute::DeferredMappingCudaArraySupported)? != 0)
}
pub fn supports_memory_pools(&self) -> CudaResult<bool> {
Ok(self.attribute(CUdevice_attribute::MemoryPoolsSupported)? != 0)
}
pub fn supports_cluster_launch(&self) -> CudaResult<bool> {
Ok(self.attribute(CUdevice_attribute::ClusterLaunch)? != 0)
}
pub fn supports_virtual_memory_management(&self) -> CudaResult<bool> {
Ok(self.attribute(CUdevice_attribute::VirtualMemoryManagementSupported)? != 0)
}
pub fn supports_handle_type_posix_fd(&self) -> CudaResult<bool> {
Ok(self.attribute(CUdevice_attribute::HandleTypePosixFileDescriptorSupported)? != 0)
}
pub fn supports_handle_type_win32(&self) -> CudaResult<bool> {
Ok(self.attribute(CUdevice_attribute::HandleTypeWin32HandleSupported)? != 0)
}
pub fn supports_handle_type_win32_kmt(&self) -> CudaResult<bool> {
Ok(self.attribute(CUdevice_attribute::HandleTypeWin32KmtHandleSupported)? != 0)
}
pub fn supports_gpu_direct_rdma_vmm(&self) -> CudaResult<bool> {
Ok(self.attribute(CUdevice_attribute::GpuDirectRdmaWithCudaVmmSupported)? != 0)
}
pub fn gpu_direct_rdma_flush_writes_options(&self) -> CudaResult<i32> {
self.attribute(CUdevice_attribute::GpuDirectRdmaFlushWritesOptions)
}
pub fn gpu_direct_rdma_writes_ordering(&self) -> CudaResult<i32> {
self.attribute(CUdevice_attribute::GpuDirectRdmaWritesOrdering)
}
pub fn max_access_policy_window_size(&self) -> CudaResult<i32> {
self.attribute(CUdevice_attribute::MaxAccessPolicyWindowSize)
}
pub fn reserved_shared_memory_per_block(&self) -> CudaResult<i32> {
self.attribute(CUdevice_attribute::ReservedSharedMemoryPerBlock)
}
pub fn supports_timeline_semaphore_interop(&self) -> CudaResult<bool> {
Ok(self.attribute(CUdevice_attribute::TimelineSemaphoreInteropSupported)? != 0)
}
pub fn supports_mem_sync_domain(&self) -> CudaResult<bool> {
Ok(self.attribute(CUdevice_attribute::MemSyncDomainSupported)? != 0)
}
pub fn mem_sync_domain_count(&self) -> CudaResult<i32> {
self.attribute(CUdevice_attribute::MemSyncDomainCount)
}
pub fn supports_gpu_direct_rdma_fabric(&self) -> CudaResult<bool> {
Ok(self.attribute(CUdevice_attribute::GpuDirectRdmaFabricSupported)? != 0)
}
pub fn supports_unified_function_pointers(&self) -> CudaResult<bool> {
Ok(self.attribute(CUdevice_attribute::UnifiedFunctionPointers)? != 0)
}
pub fn supports_ipc_events(&self) -> CudaResult<bool> {
Ok(self.attribute(CUdevice_attribute::IpcEventSupported)? != 0)
}
pub fn numa_config(&self) -> CudaResult<i32> {
self.attribute(CUdevice_attribute::NumaConfig)
}
pub fn numa_id(&self) -> CudaResult<i32> {
self.attribute(CUdevice_attribute::NumaId)
}
pub fn host_numa_id(&self) -> CudaResult<i32> {
self.attribute(CUdevice_attribute::HostNumaId)
}
pub fn texture_pitch_alignment(&self) -> CudaResult<i32> {
self.attribute(CUdevice_attribute::TexturePitchAlignment)
}
pub fn info(&self) -> CudaResult<DeviceInfo> {
let name = self.name()?;
let total_memory_bytes = self.total_memory()?;
let (cc_major, cc_minor) = self.compute_capability().unwrap_or((0, 0));
Ok(DeviceInfo {
name,
ordinal: self.ordinal,
compute_capability: (cc_major, cc_minor),
total_memory_bytes,
multiprocessor_count: self.multiprocessor_count().unwrap_or(0),
max_threads_per_block: self.max_threads_per_block().unwrap_or(0),
max_threads_per_sm: self.max_threads_per_multiprocessor().unwrap_or(0),
warp_size: self.warp_size().unwrap_or(0),
clock_rate_mhz: self.clock_rate_khz().unwrap_or(0) as f64 / 1000.0,
memory_clock_rate_mhz: self.memory_clock_rate_khz().unwrap_or(0) as f64 / 1000.0,
memory_bus_width_bits: self.memory_bus_width().unwrap_or(0),
l2_cache_bytes: self.l2_cache_size().unwrap_or(0),
max_shared_memory_per_block: self.max_shared_memory_per_block().unwrap_or(0),
max_shared_memory_per_sm: self.max_shared_memory_per_multiprocessor().unwrap_or(0),
max_registers_per_block: self.max_registers_per_block().unwrap_or(0),
ecc_enabled: self.ecc_enabled().unwrap_or(false),
tcc_driver: self.tcc_driver().unwrap_or(false),
compute_mode: self.compute_mode().unwrap_or(0),
supports_cooperative_launch: self.supports_cooperative_launch().unwrap_or(false),
supports_managed_memory: self.supports_managed_memory().unwrap_or(false),
max_persisting_l2_cache_bytes: self.max_persisting_l2_cache_size().unwrap_or(0),
async_engine_count: self.async_engine_count().unwrap_or(0),
supports_memory_pools: self.supports_memory_pools().unwrap_or(false),
supports_gpu_direct_rdma: self.supports_gpu_direct_rdma().unwrap_or(false),
supports_cluster_launch: self.supports_cluster_launch().unwrap_or(false),
supports_concurrent_kernels: self.supports_concurrent_kernels().unwrap_or(false),
supports_unified_addressing: self.supports_unified_addressing().unwrap_or(false),
max_blocks_per_sm: self.max_blocks_per_multiprocessor().unwrap_or(0),
single_to_double_perf_ratio: self.single_to_double_perf_ratio().unwrap_or(0),
})
}
#[inline]
pub fn raw(&self) -> CUdevice {
self.raw
}
#[inline]
pub fn ordinal(&self) -> i32 {
self.ordinal
}
}
impl std::fmt::Display for Device {
    /// Formats the handle as `Device(<ordinal>)`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Equivalent to `write!`, spelled out explicitly.
        f.write_fmt(format_args!("Device({})", self.ordinal))
    }
}
/// Plain-data snapshot of a device's commonly queried properties,
/// as gathered by `Device::info`.
///
/// Fields other than `name`, `ordinal`, and `total_memory_bytes` may hold
/// `0`/`false` defaults when the corresponding attribute query failed.
#[derive(Debug, Clone)]
pub struct DeviceInfo {
    /// Device name reported by the driver.
    pub name: String,
    /// Zero-based device ordinal.
    pub ordinal: i32,
    /// `(major, minor)` compute capability.
    pub compute_capability: (i32, i32),
    /// Total device memory in bytes.
    pub total_memory_bytes: usize,
    /// Number of multiprocessors (SMs).
    pub multiprocessor_count: i32,
    /// Maximum threads per block.
    pub max_threads_per_block: i32,
    /// Maximum resident threads per SM.
    pub max_threads_per_sm: i32,
    /// Warp size in threads.
    pub warp_size: i32,
    /// Core clock in MHz (converted from the kHz attribute).
    pub clock_rate_mhz: f64,
    /// Memory clock in MHz (converted from the kHz attribute).
    pub memory_clock_rate_mhz: f64,
    /// Global memory bus width in bits.
    pub memory_bus_width_bits: i32,
    /// L2 cache size in bytes.
    pub l2_cache_bytes: i32,
    /// Maximum shared memory per block, in bytes.
    pub max_shared_memory_per_block: i32,
    /// Maximum shared memory per SM, in bytes.
    pub max_shared_memory_per_sm: i32,
    /// Maximum 32-bit registers per block.
    pub max_registers_per_block: i32,
    /// Whether ECC is enabled.
    pub ecc_enabled: bool,
    /// Whether the TCC driver model is in use.
    pub tcc_driver: bool,
    /// Raw compute mode value.
    pub compute_mode: i32,
    /// Whether cooperative launch is supported.
    pub supports_cooperative_launch: bool,
    /// Whether managed memory is supported.
    pub supports_managed_memory: bool,
    /// Maximum persisting L2 cache set-aside, in bytes.
    pub max_persisting_l2_cache_bytes: i32,
    /// Number of asynchronous copy engines.
    pub async_engine_count: i32,
    /// Whether stream-ordered memory pools are supported.
    pub supports_memory_pools: bool,
    /// Whether GPUDirect RDMA is supported.
    pub supports_gpu_direct_rdma: bool,
    /// Whether thread-block cluster launch is supported.
    pub supports_cluster_launch: bool,
    /// Whether concurrent kernels are supported.
    pub supports_concurrent_kernels: bool,
    /// Whether unified addressing is supported.
    pub supports_unified_addressing: bool,
    /// Maximum resident blocks per SM.
    pub max_blocks_per_sm: i32,
    /// Single- to double-precision performance ratio.
    pub single_to_double_perf_ratio: i32,
}
impl std::fmt::Display for DeviceInfo {
    /// Renders a multi-line, human-readable property report.
    ///
    /// Every line ends with a newline except the last (`write!` rather than
    /// `writeln!`), so callers can `println!("{info}")` without a blank
    /// trailing line.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Whole-MB figure for the headline memory line.
        let mem_mb = self.total_memory_bytes / (1024 * 1024);
        let (major, minor) = self.compute_capability;
        writeln!(f, "Device {}: {}", self.ordinal, self.name)?;
        writeln!(f, " Compute capability : {major}.{minor}")?;
        writeln!(f, " Total memory : {mem_mb} MB")?;
        writeln!(f, " SMs : {}", self.multiprocessor_count)?;
        writeln!(f, " Max threads/block : {}", self.max_threads_per_block)?;
        writeln!(f, " Max threads/SM : {}", self.max_threads_per_sm)?;
        writeln!(f, " Warp size : {}", self.warp_size)?;
        writeln!(f, " Core clock : {:.1} MHz", self.clock_rate_mhz)?;
        writeln!(
            f,
            " Memory clock : {:.1} MHz",
            self.memory_clock_rate_mhz
        )?;
        writeln!(
            f,
            " Memory bus : {} bits",
            self.memory_bus_width_bits
        )?;
        // Byte-valued fields are shown in KB below.
        writeln!(
            f,
            " L2 cache : {} KB",
            self.l2_cache_bytes / 1024
        )?;
        writeln!(
            f,
            " Shared mem/block : {} KB",
            self.max_shared_memory_per_block / 1024
        )?;
        writeln!(
            f,
            " Shared mem/SM : {} KB",
            self.max_shared_memory_per_sm / 1024
        )?;
        writeln!(f, " Registers/block : {}", self.max_registers_per_block)?;
        writeln!(f, " ECC : {}", self.ecc_enabled)?;
        writeln!(f, " TCC driver : {}", self.tcc_driver)?;
        writeln!(f, " Compute mode : {}", self.compute_mode)?;
        writeln!(
            f,
            " Cooperative launch : {}",
            self.supports_cooperative_launch
        )?;
        writeln!(f, " Managed memory : {}", self.supports_managed_memory)?;
        writeln!(
            f,
            " Persist L2 cache : {} KB",
            self.max_persisting_l2_cache_bytes / 1024
        )?;
        writeln!(f, " Async engines : {}", self.async_engine_count)?;
        writeln!(f, " Memory pools : {}", self.supports_memory_pools)?;
        writeln!(
            f,
            " GPU Direct RDMA : {}",
            self.supports_gpu_direct_rdma
        )?;
        writeln!(f, " Cluster launch : {}", self.supports_cluster_launch)?;
        writeln!(
            f,
            " Concurrent kernels : {}",
            self.supports_concurrent_kernels
        )?;
        writeln!(
            f,
            " Unified addressing : {}",
            self.supports_unified_addressing
        )?;
        writeln!(f, " Max blocks/SM : {}", self.max_blocks_per_sm)?;
        // Final line: no trailing newline.
        write!(
            f,
            " FP32/FP64 ratio : {}",
            self.single_to_double_perf_ratio
        )
    }
}
/// Enumerates every CUDA device visible to the driver, in ordinal order.
///
/// Short-circuits with the first error from `Device::get`.
pub fn list_devices() -> CudaResult<Vec<Device>> {
    (0..Device::count()?).map(Device::get).collect()
}
pub fn driver_version() -> CudaResult<i32> {
let driver = try_driver()?;
let mut version: c_int = 0;
crate::error::check(unsafe { (driver.cu_driver_get_version)(&mut version) })?;
Ok(version)
}
pub fn can_access_peer(device: &Device, peer: &Device) -> CudaResult<bool> {
let driver = try_driver()?;
let mut can_access: c_int = 0;
crate::error::check(unsafe {
(driver.cu_device_can_access_peer)(&mut can_access, device.raw(), peer.raw())
})?;
Ok(can_access != 0)
}
/// Picks the device with the most total memory, or `None` when no devices
/// are present. Ties keep the lowest ordinal (strict `>` comparison).
///
/// Propagates any error from enumerating devices or querying memory.
pub fn best_device() -> CudaResult<Option<Device>> {
    let devices = list_devices()?;
    let (first, rest) = match devices.split_first() {
        Some(split) => split,
        None => return Ok(None),
    };
    let mut winner = *first;
    let mut winner_mem = winner.total_memory()?;
    for dev in rest {
        let mem = dev.total_memory()?;
        if mem > winner_mem {
            winner = *dev;
            winner_mem = mem;
        }
    }
    Ok(Some(winner))
}