// oxicuda_driver/device.rs
//! CUDA device enumeration and attribute queries.
//!
//! This module provides the [`Device`] type, which wraps a `CUdevice` handle
//! obtained from the CUDA driver. Devices are identified by ordinal (0, 1, 2,
//! ...) and expose a rich set of convenience methods for querying hardware
//! capabilities such as compute capability, memory size, multiprocessor count,
//! and various limits.
//!
//! # Examples
//!
//! ```no_run
//! use oxicuda_driver::device::{Device, list_devices};
//!
//! oxicuda_driver::init()?;
//! for dev in list_devices()? {
//!     println!("{}: compute {}.{}",
//!         dev.name()?,
//!         dev.compute_capability()?.0,
//!         dev.compute_capability()?.1,
//!     );
//! }
//! # Ok::<(), oxicuda_driver::error::CudaError>(())
//! ```
24
use std::ffi::{c_char, c_int};

use crate::error::CudaResult;
use crate::ffi::{CUdevice, CUdevice_attribute};
use crate::loader::try_driver;
30
31// ---------------------------------------------------------------------------
32// Device
33// ---------------------------------------------------------------------------
34
/// Represents a CUDA-capable GPU device.
///
/// Wraps a `CUdevice` handle obtained from the driver API. Devices are
/// identified by a zero-based ordinal index. The handle is a lightweight
/// integer that can be freely copied, so the type derives `Copy`.
///
/// Obtain instances via [`Device::get`]; query hardware properties with the
/// convenience methods or the generic [`Device::attribute`].
///
/// # Examples
///
/// ```no_run
/// use oxicuda_driver::device::Device;
///
/// oxicuda_driver::init()?;
/// let device = Device::get(0)?;
/// println!("GPU: {}", device.name()?);
/// println!("Memory: {} MB", device.total_memory()? / (1024 * 1024));
/// let (major, minor) = device.compute_capability()?;
/// println!("Compute: {major}.{minor}");
/// # Ok::<(), oxicuda_driver::error::CudaError>(())
/// ```
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct Device {
    /// Raw CUDA device handle (integer ordinal internally).
    raw: CUdevice,
    /// The ordinal that was passed to [`Device::get`] to obtain this handle.
    ordinal: i32,
}
61
impl Device {
    // -- Construction --------------------------------------------------------

    /// Get a device handle by ordinal (0-indexed).
    ///
    /// # Errors
    ///
    /// Returns [`CudaError::InvalidDevice`](crate::error::CudaError::InvalidDevice) if the ordinal is out of range,
    /// or [`CudaError::NotInitialized`](crate::error::CudaError::NotInitialized) if the driver has not been loaded.
    pub fn get(ordinal: i32) -> CudaResult<Self> {
        let driver = try_driver()?;
        let mut raw: CUdevice = 0;
        // SAFETY: `raw` is a valid, live out-pointer for the duration of the call.
        crate::error::check(unsafe { (driver.cu_device_get)(&mut raw, ordinal) })?;
        Ok(Self { raw, ordinal })
    }

    /// Get the number of CUDA-capable devices in the system.
    ///
    /// # Errors
    ///
    /// Returns an error if the driver cannot enumerate devices.
    pub fn count() -> CudaResult<i32> {
        let driver = try_driver()?;
        // Use the `c_int` alias imported at the top of the file (consistent
        // with the `use std::ffi::{c_char, c_int}` import) instead of the
        // fully qualified path.
        let mut count: c_int = 0;
        // SAFETY: `count` is a valid, live out-pointer for the duration of the call.
        crate::error::check(unsafe { (driver.cu_device_get_count)(&mut count) })?;
        Ok(count)
    }
89
90    // -- Identity ------------------------------------------------------------
91
92    /// Get the device name (e.g., `"NVIDIA A100-SXM4-80GB"`).
93    ///
94    /// The returned string is an ASCII identifier provided by the driver.
95    ///
96    /// # Errors
97    ///
98    /// Returns an error if the driver call fails.
99    pub fn name(&self) -> CudaResult<String> {
100        let driver = try_driver()?;
101        let mut buf = [0u8; 256];
102        crate::error::check(unsafe {
103            (driver.cu_device_get_name)(buf.as_mut_ptr() as *mut c_char, 256, self.raw)
104        })?;
105        let len = buf.iter().position(|&b| b == 0).unwrap_or(buf.len());
106        Ok(String::from_utf8_lossy(&buf[..len]).into_owned())
107    }
108
109    /// Get total device memory in bytes.
110    ///
111    /// # Errors
112    ///
113    /// Returns an error if the driver call fails.
114    pub fn total_memory(&self) -> CudaResult<usize> {
115        let driver = try_driver()?;
116        let mut bytes: usize = 0;
117        crate::error::check(unsafe { (driver.cu_device_total_mem_v2)(&mut bytes, self.raw) })?;
118        Ok(bytes)
119    }
120
121    // -- Generic attribute query --------------------------------------------
122
123    /// Query an arbitrary device attribute.
124    ///
125    /// This is the low-level building block for all the convenience methods
126    /// below. Callers can use any [`CUdevice_attribute`] variant directly.
127    ///
128    /// # Errors
129    ///
130    /// Returns an error if the attribute is not supported or the driver call
131    /// fails.
132    pub fn attribute(&self, attr: CUdevice_attribute) -> CudaResult<i32> {
133        let driver = try_driver()?;
134        let mut value: std::ffi::c_int = 0;
135        crate::error::check(unsafe {
136            (driver.cu_device_get_attribute)(&mut value, attr, self.raw)
137        })?;
138        Ok(value)
139    }
140
141    // -- Compute capability -------------------------------------------------
142
143    /// Get compute capability as `(major, minor)`.
144    ///
145    /// For example, an A100 returns `(8, 0)` and an RTX 4090 returns `(8, 9)`.
146    ///
147    /// # Errors
148    ///
149    /// Returns an error if the driver call fails.
150    pub fn compute_capability(&self) -> CudaResult<(i32, i32)> {
151        let major = self.attribute(CUdevice_attribute::ComputeCapabilityMajor)?;
152        let minor = self.attribute(CUdevice_attribute::ComputeCapabilityMinor)?;
153        Ok((major, minor))
154    }
155
    // -- Thread / block / grid limits ---------------------------------------
    //
    // Every getter below is a thin wrapper over [`Device::attribute`] for a
    // single `CUdevice_attribute` variant; each returns an error if the
    // underlying driver query fails.

    /// Get the maximum number of threads per block.
    pub fn max_threads_per_block(&self) -> CudaResult<i32> {
        self.attribute(CUdevice_attribute::MaxThreadsPerBlock)
    }

    /// Get the maximum block dimensions as `(x, y, z)`.
    pub fn max_block_dim(&self) -> CudaResult<(i32, i32, i32)> {
        Ok((
            self.attribute(CUdevice_attribute::MaxBlockDimX)?,
            self.attribute(CUdevice_attribute::MaxBlockDimY)?,
            self.attribute(CUdevice_attribute::MaxBlockDimZ)?,
        ))
    }

    /// Get the maximum grid dimensions as `(x, y, z)`.
    pub fn max_grid_dim(&self) -> CudaResult<(i32, i32, i32)> {
        Ok((
            self.attribute(CUdevice_attribute::MaxGridDimX)?,
            self.attribute(CUdevice_attribute::MaxGridDimY)?,
            self.attribute(CUdevice_attribute::MaxGridDimZ)?,
        ))
    }

    /// Get the maximum number of threads per multiprocessor.
    pub fn max_threads_per_multiprocessor(&self) -> CudaResult<i32> {
        self.attribute(CUdevice_attribute::MaxThreadsPerMultiprocessor)
    }

    /// Get the maximum number of blocks per multiprocessor.
    pub fn max_blocks_per_multiprocessor(&self) -> CudaResult<i32> {
        self.attribute(CUdevice_attribute::MaxBlocksPerMultiprocessor)
    }

    // -- Multiprocessor / warp ----------------------------------------------

    /// Get the number of streaming multiprocessors (SMs) on the device.
    pub fn multiprocessor_count(&self) -> CudaResult<i32> {
        self.attribute(CUdevice_attribute::MultiprocessorCount)
    }

    /// Get the warp size in threads (typically 32 for all NVIDIA GPUs).
    pub fn warp_size(&self) -> CudaResult<i32> {
        self.attribute(CUdevice_attribute::WarpSize)
    }

    // -- Memory hierarchy ---------------------------------------------------
    //
    // Sizes are reported by the driver as `i32` byte counts.

    /// Get the maximum shared memory per block in bytes.
    pub fn max_shared_memory_per_block(&self) -> CudaResult<i32> {
        self.attribute(CUdevice_attribute::MaxSharedMemoryPerBlock)
    }

    /// Get the maximum shared memory per multiprocessor in bytes.
    pub fn max_shared_memory_per_multiprocessor(&self) -> CudaResult<i32> {
        self.attribute(CUdevice_attribute::MaxSharedMemoryPerMultiprocessor)
    }

    /// Get the maximum opt-in shared memory per block in bytes.
    ///
    /// This is the upper bound achievable via
    /// `cuFuncSetAttribute(CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES)`.
    pub fn max_shared_memory_per_block_optin(&self) -> CudaResult<i32> {
        self.attribute(CUdevice_attribute::MaxSharedMemoryPerBlockOptin)
    }

    /// Get the maximum number of 32-bit registers per block.
    pub fn max_registers_per_block(&self) -> CudaResult<i32> {
        self.attribute(CUdevice_attribute::MaxRegistersPerBlock)
    }

    /// Get the maximum number of 32-bit registers per multiprocessor.
    pub fn max_registers_per_multiprocessor(&self) -> CudaResult<i32> {
        self.attribute(CUdevice_attribute::MaxRegistersPerMultiprocessor)
    }

    /// Get the L2 cache size in bytes.
    pub fn l2_cache_size(&self) -> CudaResult<i32> {
        self.attribute(CUdevice_attribute::L2CacheSize)
    }

    /// Get the total constant memory on the device in bytes.
    pub fn total_constant_memory(&self) -> CudaResult<i32> {
        self.attribute(CUdevice_attribute::TotalConstantMemory)
    }

    // -- Clock / bus --------------------------------------------------------
    //
    // Clock rates are in kHz as reported by the driver; see
    // [`Device::info`] for a pre-converted MHz view.

    /// Get the core clock rate in kHz.
    pub fn clock_rate_khz(&self) -> CudaResult<i32> {
        self.attribute(CUdevice_attribute::ClockRate)
    }

    /// Get the memory clock rate in kHz.
    pub fn memory_clock_rate_khz(&self) -> CudaResult<i32> {
        self.attribute(CUdevice_attribute::MemoryClockRate)
    }

    /// Get the global memory bus width in bits.
    pub fn memory_bus_width(&self) -> CudaResult<i32> {
        self.attribute(CUdevice_attribute::GlobalMemoryBusWidth)
    }

    // -- PCI topology -------------------------------------------------------

    /// Get the PCI bus ID of the device.
    pub fn pci_bus_id(&self) -> CudaResult<i32> {
        self.attribute(CUdevice_attribute::PciBusId)
    }

    /// Get the PCI device ID.
    pub fn pci_device_id(&self) -> CudaResult<i32> {
        self.attribute(CUdevice_attribute::PciDeviceId)
    }

    /// Get the PCI domain ID.
    pub fn pci_domain_id(&self) -> CudaResult<i32> {
        self.attribute(CUdevice_attribute::PciDomainId)
    }
276
277    // -- Feature / capability flags -----------------------------------------
278
279    /// Check if the device supports managed (unified) memory.
280    pub fn supports_managed_memory(&self) -> CudaResult<bool> {
281        Ok(self.attribute(CUdevice_attribute::ManagedMemory)? != 0)
282    }
283
284    /// Check if the device supports concurrent managed memory access.
285    pub fn supports_concurrent_managed_access(&self) -> CudaResult<bool> {
286        Ok(self.attribute(CUdevice_attribute::ConcurrentManagedAccess)? != 0)
287    }
288
289    /// Check if the device supports concurrent kernel execution.
290    pub fn supports_concurrent_kernels(&self) -> CudaResult<bool> {
291        Ok(self.attribute(CUdevice_attribute::ConcurrentKernels)? != 0)
292    }
293
294    /// Check if the device supports cooperative kernel launches.
295    pub fn supports_cooperative_launch(&self) -> CudaResult<bool> {
296        Ok(self.attribute(CUdevice_attribute::CooperativeLaunch)? != 0)
297    }
298
299    /// Check if ECC memory is enabled on the device.
300    pub fn ecc_enabled(&self) -> CudaResult<bool> {
301        Ok(self.attribute(CUdevice_attribute::EccEnabled)? != 0)
302    }
303
304    /// Check if the device is integrated (shares memory with the host).
305    pub fn is_integrated(&self) -> CudaResult<bool> {
306        Ok(self.attribute(CUdevice_attribute::Integrated)? != 0)
307    }
308
309    /// Check if the device can map host memory into its address space.
310    pub fn can_map_host_memory(&self) -> CudaResult<bool> {
311        Ok(self.attribute(CUdevice_attribute::CanMapHostMemory)? != 0)
312    }
313
314    /// Check if the device uses a unified address space with the host.
315    pub fn supports_unified_addressing(&self) -> CudaResult<bool> {
316        Ok(self.attribute(CUdevice_attribute::UnifiedAddressing)? != 0)
317    }
318
319    /// Check if the device supports stream priorities.
320    pub fn supports_stream_priorities(&self) -> CudaResult<bool> {
321        Ok(self.attribute(CUdevice_attribute::StreamPrioritiesSupported)? != 0)
322    }
323
324    /// Check if the device supports compute preemption.
325    pub fn supports_compute_preemption(&self) -> CudaResult<bool> {
326        Ok(self.attribute(CUdevice_attribute::ComputePreemptionSupported)? != 0)
327    }
328
329    /// Get the number of asynchronous engines (copy engines).
330    pub fn async_engine_count(&self) -> CudaResult<i32> {
331        self.attribute(CUdevice_attribute::AsyncEngineCount)
332    }
333
334    /// Check if the device is on a multi-GPU board.
335    pub fn is_multi_gpu_board(&self) -> CudaResult<bool> {
336        Ok(self.attribute(CUdevice_attribute::IsMultiGpuBoard)? != 0)
337    }
338
339    /// Check if there is a kernel execution timeout enforced by the OS.
340    pub fn has_kernel_exec_timeout(&self) -> CudaResult<bool> {
341        Ok(self.attribute(CUdevice_attribute::KernelExecTimeout)? != 0)
342    }
343
    // -- Compute mode / driver model ----------------------------------------
    //
    // Like the getters above, everything in the sections below is a thin
    // wrapper over [`Device::attribute`] for one `CUdevice_attribute` variant;
    // boolean wrappers treat any non-zero attribute value as `true`.

    /// Get the compute mode (0=default, 1=exclusive-thread, 2=prohibited,
    /// 3=exclusive-process).
    pub fn compute_mode(&self) -> CudaResult<i32> {
        self.attribute(CUdevice_attribute::ComputeMode)
    }

    /// Check if the device uses the TCC (Tesla Compute Cluster) driver model.
    ///
    /// TCC mode disables the display driver, giving full GPU resources to
    /// compute workloads.
    pub fn tcc_driver(&self) -> CudaResult<bool> {
        Ok(self.attribute(CUdevice_attribute::TccDriver)? != 0)
    }

    /// Get the multi-GPU board group identifier.
    ///
    /// Devices on the same board share the same group ID.
    pub fn multi_gpu_board_group_id(&self) -> CudaResult<i32> {
        self.attribute(CUdevice_attribute::MultiGpuBoardGroupId)
    }

    // -- Memory features (extended) -----------------------------------------

    /// Get the maximum persisting L2 cache size in bytes (Ampere+).
    pub fn max_persisting_l2_cache_size(&self) -> CudaResult<i32> {
        self.attribute(CUdevice_attribute::MaxPersistingL2CacheSize)
    }

    /// Check if the device supports generic memory compression.
    pub fn supports_generic_compression(&self) -> CudaResult<bool> {
        Ok(self.attribute(CUdevice_attribute::GenericCompressionSupported)? != 0)
    }

    /// Check if the device supports pageable memory access.
    pub fn supports_pageable_memory_access(&self) -> CudaResult<bool> {
        Ok(self.attribute(CUdevice_attribute::PageableMemoryAccess)? != 0)
    }

    /// Check if pageable memory access uses host page tables.
    pub fn pageable_memory_uses_host_page_tables(&self) -> CudaResult<bool> {
        Ok(self.attribute(CUdevice_attribute::PageableMemoryAccessUsesHostPageTables)? != 0)
    }

    /// Check if the device supports direct managed memory access from the host.
    pub fn supports_direct_managed_mem_from_host(&self) -> CudaResult<bool> {
        Ok(self.attribute(CUdevice_attribute::DirectManagedMemAccessFromHost)? != 0)
    }

    /// Get memory pool supported handle types as a bitmask.
    pub fn memory_pool_supported_handle_types(&self) -> CudaResult<i32> {
        self.attribute(CUdevice_attribute::MemoryPoolSupportedHandleTypes)
    }

    // -- Advanced features --------------------------------------------------

    /// Check if the device supports host-visible native atomic operations.
    pub fn supports_host_native_atomics(&self) -> CudaResult<bool> {
        Ok(self.attribute(CUdevice_attribute::HostNativeAtomicSupported)? != 0)
    }

    /// Get the ratio of single-precision to double-precision performance.
    ///
    /// A higher value means the GPU is relatively faster at FP32 than FP64.
    pub fn single_to_double_perf_ratio(&self) -> CudaResult<i32> {
        self.attribute(CUdevice_attribute::SingleToDoublePrecisionPerfRatio)
    }

    /// Check if the device supports cooperative multi-device kernel launches.
    pub fn supports_cooperative_multi_device_launch(&self) -> CudaResult<bool> {
        Ok(self.attribute(CUdevice_attribute::CooperativeMultiDeviceLaunch)? != 0)
    }

    /// Check if the device supports flushing outstanding remote writes.
    pub fn supports_flush_remote_writes(&self) -> CudaResult<bool> {
        Ok(self.attribute(CUdevice_attribute::CanFlushRemoteWrites)? != 0)
    }

    /// Check if the device supports host-side memory register functions.
    pub fn supports_host_register(&self) -> CudaResult<bool> {
        Ok(self.attribute(CUdevice_attribute::HostRegisterSupported)? != 0)
    }

    /// Check if the device can use host pointers for registered memory.
    pub fn can_use_host_pointer_for_registered_mem(&self) -> CudaResult<bool> {
        Ok(self.attribute(CUdevice_attribute::CanUseHostPointerForRegisteredMem)? != 0)
    }

    /// Check if the device supports GPU Direct RDMA.
    pub fn supports_gpu_direct_rdma(&self) -> CudaResult<bool> {
        Ok(self.attribute(CUdevice_attribute::GpuDirectRdmaSupported)? != 0)
    }

    /// Check if the device supports tensor-map access (Hopper+).
    pub fn supports_tensor_map_access(&self) -> CudaResult<bool> {
        Ok(self.attribute(CUdevice_attribute::TensorMapAccessSupported)? != 0)
    }

    /// Check if the device supports multicast operations.
    pub fn supports_multicast(&self) -> CudaResult<bool> {
        Ok(self.attribute(CUdevice_attribute::MulticastSupported)? != 0)
    }

    /// Check if Multi-Process Service (MPS) is enabled on the device.
    pub fn mps_enabled(&self) -> CudaResult<bool> {
        Ok(self.attribute(CUdevice_attribute::MpsEnabled)? != 0)
    }

    // -- Texture / surface limits -------------------------------------------
    //
    // Dimensions are in texels; alignments are in bytes.

    /// Get the maximum 1D texture width.
    pub fn max_texture_1d_width(&self) -> CudaResult<i32> {
        self.attribute(CUdevice_attribute::MaxTexture1DWidth)
    }

    /// Get the maximum 2D texture dimensions as `(width, height)`.
    pub fn max_texture_2d_dims(&self) -> CudaResult<(i32, i32)> {
        Ok((
            self.attribute(CUdevice_attribute::MaxTexture2DWidth)?,
            self.attribute(CUdevice_attribute::MaxTexture2DHeight)?,
        ))
    }

    /// Get the maximum 3D texture dimensions as `(width, height, depth)`.
    pub fn max_texture_3d_dims(&self) -> CudaResult<(i32, i32, i32)> {
        Ok((
            self.attribute(CUdevice_attribute::MaxTexture3DWidth)?,
            self.attribute(CUdevice_attribute::MaxTexture3DHeight)?,
            self.attribute(CUdevice_attribute::MaxTexture3DDepth)?,
        ))
    }

    /// Check if the device can copy memory and execute a kernel concurrently.
    pub fn gpu_overlap(&self) -> CudaResult<bool> {
        Ok(self.attribute(CUdevice_attribute::GpuOverlap)? != 0)
    }

    /// Get the maximum pitch for memory copies in bytes.
    pub fn max_pitch(&self) -> CudaResult<i32> {
        self.attribute(CUdevice_attribute::MaxPitch)
    }

    /// Get the texture alignment requirement in bytes.
    pub fn texture_alignment(&self) -> CudaResult<i32> {
        self.attribute(CUdevice_attribute::TextureAlignment)
    }

    /// Get the surface alignment requirement in bytes.
    pub fn surface_alignment(&self) -> CudaResult<i32> {
        self.attribute(CUdevice_attribute::SurfaceAlignment)
    }

    /// Check if the device supports deferred mapping of CUDA arrays.
    pub fn supports_deferred_mapping(&self) -> CudaResult<bool> {
        Ok(self.attribute(CUdevice_attribute::DeferredMappingCudaArraySupported)? != 0)
    }

    // -- Memory pool / async features ---------------------------------------

    /// Check if the device supports memory pools (`cudaMallocAsync`).
    pub fn supports_memory_pools(&self) -> CudaResult<bool> {
        Ok(self.attribute(CUdevice_attribute::MemoryPoolsSupported)? != 0)
    }

    /// Check if the device supports cluster launch (Hopper+).
    pub fn supports_cluster_launch(&self) -> CudaResult<bool> {
        Ok(self.attribute(CUdevice_attribute::ClusterLaunch)? != 0)
    }

    /// Check if the device supports virtual memory management APIs.
    pub fn supports_virtual_memory_management(&self) -> CudaResult<bool> {
        Ok(self.attribute(CUdevice_attribute::VirtualMemoryManagementSupported)? != 0)
    }

    /// Check if the device supports POSIX file descriptor handles for IPC.
    pub fn supports_handle_type_posix_fd(&self) -> CudaResult<bool> {
        Ok(self.attribute(CUdevice_attribute::HandleTypePosixFileDescriptorSupported)? != 0)
    }

    /// Check if the device supports Win32 handles for IPC.
    pub fn supports_handle_type_win32(&self) -> CudaResult<bool> {
        Ok(self.attribute(CUdevice_attribute::HandleTypeWin32HandleSupported)? != 0)
    }

    /// Check if the device supports Win32 KMT handles for IPC.
    pub fn supports_handle_type_win32_kmt(&self) -> CudaResult<bool> {
        Ok(self.attribute(CUdevice_attribute::HandleTypeWin32KmtHandleSupported)? != 0)
    }

    /// Check if the device supports GPU Direct RDMA with CUDA VMM.
    pub fn supports_gpu_direct_rdma_vmm(&self) -> CudaResult<bool> {
        Ok(self.attribute(CUdevice_attribute::GpuDirectRdmaWithCudaVmmSupported)? != 0)
    }

    /// Get the GPU Direct RDMA flush-writes options bitmask.
    pub fn gpu_direct_rdma_flush_writes_options(&self) -> CudaResult<i32> {
        self.attribute(CUdevice_attribute::GpuDirectRdmaFlushWritesOptions)
    }

    /// Get the GPU Direct RDMA writes ordering.
    pub fn gpu_direct_rdma_writes_ordering(&self) -> CudaResult<i32> {
        self.attribute(CUdevice_attribute::GpuDirectRdmaWritesOrdering)
    }

    /// Get the maximum access-policy window size for L2 cache.
    pub fn max_access_policy_window_size(&self) -> CudaResult<i32> {
        self.attribute(CUdevice_attribute::MaxAccessPolicyWindowSize)
    }

    /// Get the reserved shared memory per block in bytes.
    pub fn reserved_shared_memory_per_block(&self) -> CudaResult<i32> {
        self.attribute(CUdevice_attribute::ReservedSharedMemoryPerBlock)
    }

    /// Check if timeline semaphore interop is supported.
    pub fn supports_timeline_semaphore_interop(&self) -> CudaResult<bool> {
        Ok(self.attribute(CUdevice_attribute::TimelineSemaphoreInteropSupported)? != 0)
    }

    /// Check if memory sync domain operations are supported.
    pub fn supports_mem_sync_domain(&self) -> CudaResult<bool> {
        Ok(self.attribute(CUdevice_attribute::MemSyncDomainSupported)? != 0)
    }

    /// Get the number of memory sync domains.
    pub fn mem_sync_domain_count(&self) -> CudaResult<i32> {
        self.attribute(CUdevice_attribute::MemSyncDomainCount)
    }

    /// Check if GPU-Direct Fabric (RDMA) is supported.
    pub fn supports_gpu_direct_rdma_fabric(&self) -> CudaResult<bool> {
        Ok(self.attribute(CUdevice_attribute::GpuDirectRdmaFabricSupported)? != 0)
    }

    /// Check if unified function pointers are supported.
    pub fn supports_unified_function_pointers(&self) -> CudaResult<bool> {
        Ok(self.attribute(CUdevice_attribute::UnifiedFunctionPointers)? != 0)
    }

    /// Check if IPC event handles are supported.
    pub fn supports_ipc_events(&self) -> CudaResult<bool> {
        Ok(self.attribute(CUdevice_attribute::IpcEventSupported)? != 0)
    }

    /// Get the NUMA configuration of the device.
    ///
    /// NOTE(review): the integer encoding of this value is not documented
    /// here — confirm against the driver's `CUdevice_attribute` docs.
    pub fn numa_config(&self) -> CudaResult<i32> {
        self.attribute(CUdevice_attribute::NumaConfig)
    }

    /// Get the NUMA ID of the device.
    pub fn numa_id(&self) -> CudaResult<i32> {
        self.attribute(CUdevice_attribute::NumaId)
    }

    /// Get the host NUMA ID of the device.
    pub fn host_numa_id(&self) -> CudaResult<i32> {
        self.attribute(CUdevice_attribute::HostNumaId)
    }

    /// Get the texture pitch alignment requirement in bytes.
    pub fn texture_pitch_alignment(&self) -> CudaResult<i32> {
        self.attribute(CUdevice_attribute::TexturePitchAlignment)
    }
608
609    // -- Structured info ----------------------------------------------------
610
611    /// Gather comprehensive device information in a single call.
612    ///
613    /// Returns a [`DeviceInfo`] with all key properties. Individual attribute
614    /// query failures are silently replaced with default values (`0` / `false`)
615    /// so that the call succeeds even on older drivers that lack some attributes.
616    ///
617    /// # Errors
618    ///
619    /// Returns an error only if the device name or total memory cannot be
620    /// queried (fundamental properties).
621    pub fn info(&self) -> CudaResult<DeviceInfo> {
622        let name = self.name()?;
623        let total_memory_bytes = self.total_memory()?;
624        let (cc_major, cc_minor) = self.compute_capability().unwrap_or((0, 0));
625
626        Ok(DeviceInfo {
627            name,
628            ordinal: self.ordinal,
629            compute_capability: (cc_major, cc_minor),
630            total_memory_bytes,
631            multiprocessor_count: self.multiprocessor_count().unwrap_or(0),
632            max_threads_per_block: self.max_threads_per_block().unwrap_or(0),
633            max_threads_per_sm: self.max_threads_per_multiprocessor().unwrap_or(0),
634            warp_size: self.warp_size().unwrap_or(0),
635            clock_rate_mhz: self.clock_rate_khz().unwrap_or(0) as f64 / 1000.0,
636            memory_clock_rate_mhz: self.memory_clock_rate_khz().unwrap_or(0) as f64 / 1000.0,
637            memory_bus_width_bits: self.memory_bus_width().unwrap_or(0),
638            l2_cache_bytes: self.l2_cache_size().unwrap_or(0),
639            max_shared_memory_per_block: self.max_shared_memory_per_block().unwrap_or(0),
640            max_shared_memory_per_sm: self.max_shared_memory_per_multiprocessor().unwrap_or(0),
641            max_registers_per_block: self.max_registers_per_block().unwrap_or(0),
642            ecc_enabled: self.ecc_enabled().unwrap_or(false),
643            tcc_driver: self.tcc_driver().unwrap_or(false),
644            compute_mode: self.compute_mode().unwrap_or(0),
645            supports_cooperative_launch: self.supports_cooperative_launch().unwrap_or(false),
646            supports_managed_memory: self.supports_managed_memory().unwrap_or(false),
647            max_persisting_l2_cache_bytes: self.max_persisting_l2_cache_size().unwrap_or(0),
648            async_engine_count: self.async_engine_count().unwrap_or(0),
649            supports_memory_pools: self.supports_memory_pools().unwrap_or(false),
650            supports_gpu_direct_rdma: self.supports_gpu_direct_rdma().unwrap_or(false),
651            supports_cluster_launch: self.supports_cluster_launch().unwrap_or(false),
652            supports_concurrent_kernels: self.supports_concurrent_kernels().unwrap_or(false),
653            supports_unified_addressing: self.supports_unified_addressing().unwrap_or(false),
654            max_blocks_per_sm: self.max_blocks_per_multiprocessor().unwrap_or(0),
655            single_to_double_perf_ratio: self.single_to_double_perf_ratio().unwrap_or(0),
656        })
657    }
658
    // -- Raw access ---------------------------------------------------------

    /// Get the raw `CUdevice` handle for use with FFI calls.
    ///
    /// The handle is a lightweight integer (see the struct docs); copying it
    /// is free.
    pub fn raw(&self) -> CUdevice {
        self.raw
    }

    /// Get the device ordinal that was used to obtain this handle.
    pub fn ordinal(&self) -> i32 {
        self.ordinal
    }
}
673
674impl std::fmt::Display for Device {
675    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
676        write!(f, "Device({})", self.ordinal)
677    }
678}
679
680// ---------------------------------------------------------------------------
681// DeviceInfo — structured summary
682// ---------------------------------------------------------------------------
683
/// Comprehensive device information gathered in a single call.
///
/// Produced by [`Device::info`]. All fields are populated via convenience
/// methods on [`Device`]. Fields that fail to query (e.g. on an older driver)
/// default to `0` / `false`.
#[derive(Debug, Clone)]
pub struct DeviceInfo {
    /// Human-readable device name.
    pub name: String,
    /// Zero-based device ordinal.
    pub ordinal: i32,
    /// Compute capability `(major, minor)`.
    pub compute_capability: (i32, i32),
    /// Total device memory in bytes.
    pub total_memory_bytes: usize,
    /// Number of streaming multiprocessors.
    pub multiprocessor_count: i32,
    /// Maximum threads per block.
    pub max_threads_per_block: i32,
    /// Maximum threads per streaming multiprocessor.
    pub max_threads_per_sm: i32,
    /// Warp size in threads.
    pub warp_size: i32,
    /// Core clock rate in MHz (converted from the driver's kHz value).
    pub clock_rate_mhz: f64,
    /// Memory clock rate in MHz (converted from the driver's kHz value).
    pub memory_clock_rate_mhz: f64,
    /// Memory bus width in bits.
    pub memory_bus_width_bits: i32,
    /// L2 cache size in bytes.
    pub l2_cache_bytes: i32,
    /// Maximum shared memory per block in bytes.
    pub max_shared_memory_per_block: i32,
    /// Maximum shared memory per SM in bytes.
    pub max_shared_memory_per_sm: i32,
    /// Maximum 32-bit registers per block.
    pub max_registers_per_block: i32,
    /// ECC memory enabled.
    pub ecc_enabled: bool,
    /// TCC driver mode.
    pub tcc_driver: bool,
    /// Compute mode (0=default, 1=exclusive-thread, 2=prohibited, 3=exclusive-process).
    pub compute_mode: i32,
    /// Cooperative kernel launch support.
    pub supports_cooperative_launch: bool,
    /// Managed (unified) memory support.
    pub supports_managed_memory: bool,
    /// Maximum persisting L2 cache size in bytes (Ampere+).
    pub max_persisting_l2_cache_bytes: i32,
    /// Number of async copy engines.
    pub async_engine_count: i32,
    /// Supports memory pools (`cudaMallocAsync`).
    pub supports_memory_pools: bool,
    /// Supports GPU Direct RDMA.
    pub supports_gpu_direct_rdma: bool,
    /// Supports cluster launch (Hopper+).
    pub supports_cluster_launch: bool,
    /// Concurrent kernel execution supported.
    pub supports_concurrent_kernels: bool,
    /// Unified addressing supported.
    pub supports_unified_addressing: bool,
    /// Maximum blocks per SM.
    pub max_blocks_per_sm: i32,
    /// Single-to-double precision performance ratio.
    pub single_to_double_perf_ratio: i32,
}
749
750impl std::fmt::Display for DeviceInfo {
751    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
752        let mem_mb = self.total_memory_bytes / (1024 * 1024);
753        let (major, minor) = self.compute_capability;
754        writeln!(f, "Device {}: {}", self.ordinal, self.name)?;
755        writeln!(f, "  Compute capability : {major}.{minor}")?;
756        writeln!(f, "  Total memory       : {mem_mb} MB")?;
757        writeln!(f, "  SMs                : {}", self.multiprocessor_count)?;
758        writeln!(f, "  Max threads/block  : {}", self.max_threads_per_block)?;
759        writeln!(f, "  Max threads/SM     : {}", self.max_threads_per_sm)?;
760        writeln!(f, "  Warp size          : {}", self.warp_size)?;
761        writeln!(f, "  Core clock         : {:.1} MHz", self.clock_rate_mhz)?;
762        writeln!(
763            f,
764            "  Memory clock       : {:.1} MHz",
765            self.memory_clock_rate_mhz
766        )?;
767        writeln!(
768            f,
769            "  Memory bus         : {} bits",
770            self.memory_bus_width_bits
771        )?;
772        writeln!(
773            f,
774            "  L2 cache           : {} KB",
775            self.l2_cache_bytes / 1024
776        )?;
777        writeln!(
778            f,
779            "  Shared mem/block   : {} KB",
780            self.max_shared_memory_per_block / 1024
781        )?;
782        writeln!(
783            f,
784            "  Shared mem/SM      : {} KB",
785            self.max_shared_memory_per_sm / 1024
786        )?;
787        writeln!(f, "  Registers/block    : {}", self.max_registers_per_block)?;
788        writeln!(f, "  ECC                : {}", self.ecc_enabled)?;
789        writeln!(f, "  TCC driver         : {}", self.tcc_driver)?;
790        writeln!(f, "  Compute mode       : {}", self.compute_mode)?;
791        writeln!(
792            f,
793            "  Cooperative launch : {}",
794            self.supports_cooperative_launch
795        )?;
796        writeln!(f, "  Managed memory     : {}", self.supports_managed_memory)?;
797        writeln!(
798            f,
799            "  Persist L2 cache   : {} KB",
800            self.max_persisting_l2_cache_bytes / 1024
801        )?;
802        writeln!(f, "  Async engines      : {}", self.async_engine_count)?;
803        writeln!(f, "  Memory pools       : {}", self.supports_memory_pools)?;
804        writeln!(
805            f,
806            "  GPU Direct RDMA    : {}",
807            self.supports_gpu_direct_rdma
808        )?;
809        writeln!(f, "  Cluster launch     : {}", self.supports_cluster_launch)?;
810        writeln!(
811            f,
812            "  Concurrent kernels : {}",
813            self.supports_concurrent_kernels
814        )?;
815        writeln!(
816            f,
817            "  Unified addressing : {}",
818            self.supports_unified_addressing
819        )?;
820        writeln!(f, "  Max blocks/SM      : {}", self.max_blocks_per_sm)?;
821        write!(
822            f,
823            "  FP32/FP64 ratio    : {}",
824            self.single_to_double_perf_ratio
825        )
826    }
827}
828
829// ---------------------------------------------------------------------------
830// Free functions
831// ---------------------------------------------------------------------------
832
833/// List all available CUDA devices.
834///
835/// Returns a vector of [`Device`] handles for every GPU visible to the driver.
836/// The vector is empty if no CUDA-capable devices are present.
837///
838/// # Errors
839///
840/// Returns an error if the driver cannot be loaded or device enumeration fails.
841///
842/// # Examples
843///
844/// ```no_run
845/// use oxicuda_driver::device::list_devices;
846///
847/// oxicuda_driver::init()?;
848/// for dev in list_devices()? {
849///     println!("{}: {} MB", dev.name()?, dev.total_memory()? / (1024 * 1024));
850/// }
851/// # Ok::<(), oxicuda_driver::error::CudaError>(())
852/// ```
853pub fn list_devices() -> CudaResult<Vec<Device>> {
854    let count = Device::count()?;
855    let mut devices = Vec::with_capacity(count as usize);
856    for i in 0..count {
857        devices.push(Device::get(i)?);
858    }
859    Ok(devices)
860}
861
862/// Query the CUDA driver version.
863///
864/// Returns the version as `major * 1000 + minor * 10`, e.g. 12060 for CUDA 12.6.
865/// This is a system-wide property that does not depend on a specific device.
866///
867/// # Errors
868///
869/// Returns an error if the driver cannot be loaded or `cuDriverGetVersion` fails.
870pub fn driver_version() -> CudaResult<i32> {
871    let driver = try_driver()?;
872    let mut version: c_int = 0;
873    crate::error::check(unsafe { (driver.cu_driver_get_version)(&mut version) })?;
874    Ok(version)
875}
876
877/// Query whether peer access is possible between two devices.
878///
879/// Peer access allows one GPU to directly access another GPU's memory
880/// without going through the host. This requires both devices to support
881/// peer access and be connected via a suitable interconnect (e.g., NVLink
882/// or PCIe peer-to-peer).
883///
884/// # Parameters
885///
886/// * `device` — the device that would initiate the access.
887/// * `peer` — the device whose memory would be accessed.
888///
889/// # Errors
890///
891/// Returns an error if the driver cannot be loaded or the query fails.
892pub fn can_access_peer(device: &Device, peer: &Device) -> CudaResult<bool> {
893    let driver = try_driver()?;
894    let mut can_access: c_int = 0;
895    crate::error::check(unsafe {
896        (driver.cu_device_can_access_peer)(&mut can_access, device.raw(), peer.raw())
897    })?;
898    Ok(can_access != 0)
899}
900
901/// Find the device with the most total memory.
902///
903/// Returns `None` if no CUDA devices are available.
904///
905/// # Errors
906///
907/// Returns an error if device enumeration or memory queries fail.
908///
909/// # Examples
910///
911/// ```no_run
912/// use oxicuda_driver::device::best_device;
913///
914/// oxicuda_driver::init()?;
915/// if let Some(dev) = best_device()? {
916///     println!("Best GPU: {} ({} MB)", dev.name()?, dev.total_memory()? / (1024 * 1024));
917/// }
918/// # Ok::<(), oxicuda_driver::error::CudaError>(())
919/// ```
920pub fn best_device() -> CudaResult<Option<Device>> {
921    let devices = list_devices()?;
922    if devices.is_empty() {
923        return Ok(None);
924    }
925    let mut best = devices[0];
926    let mut best_mem = best.total_memory()?;
927    for dev in devices.iter().skip(1) {
928        let mem = dev.total_memory()?;
929        if mem > best_mem {
930            best = *dev;
931            best_mem = mem;
932        }
933    }
934    Ok(Some(best))
935}