Skip to main content

singe_cuda/
device.rs

1#![allow(deprecated)]
2
3use std::{
4    ffi::CString,
5    mem::{self, MaybeUninit},
6};
7
8use num_enum::{IntoPrimitive, TryFromPrimitive};
9use singe_core::{impl_enum_conversion, impl_enum_display, string_from_c_chars};
10use singe_cuda_sys::driver;
11use singe_cuda_sys::runtime;
12
13use crate::{
14    context::ContextFlags,
15    error::{Error, Result},
16    try_ffi,
17    types::FunctionCache,
18};
19
20/// CUDA Limits.
21#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, TryFromPrimitive, IntoPrimitive)]
22#[repr(u32)]
23#[non_exhaustive]
24pub enum Limit {
25    /// GPU thread stack size.
26    StackSize = runtime::cudaLimit::cudaLimitStackSize as _,
27    /// GPU printf FIFO size.
28    PrintfFifoSize = runtime::cudaLimit::cudaLimitPrintfFifoSize as _,
29    /// GPU malloc heap size.
30    MallocHeapSize = runtime::cudaLimit::cudaLimitMallocHeapSize as _,
31    /// GPU device runtime synchronize depth.
32    DevRuntimeSyncDepth = runtime::cudaLimit::cudaLimitDevRuntimeSyncDepth as _,
33    /// GPU device runtime pending launch count.
34    DevRuntimePendingLaunchCount = runtime::cudaLimit::cudaLimitDevRuntimePendingLaunchCount as _,
35    /// A value between 0 and 128 that indicates the maximum fetch granularity of L2, in bytes.
36    /// The value is a hint.
37    MaxL2FetchGranularity = runtime::cudaLimit::cudaLimitMaxL2FetchGranularity as _,
38    /// A size in bytes for L2 persisting lines cache size.
39    PersistingL2CacheSize = runtime::cudaLimit::cudaLimitPersistingL2CacheSize as _,
40}
41
42impl_enum_conversion!(runtime::cudaLimit, Limit);
43
44impl_enum_display!(Limit, {
45    Self::StackSize => "cudaLimitStackSize",
46    Self::PrintfFifoSize => "cudaLimitPrintfFifoSize",
47    Self::MallocHeapSize => "cudaLimitMallocHeapSize",
48    Self::DevRuntimeSyncDepth => "cudaLimitDevRuntimeSyncDepth",
49    Self::DevRuntimePendingLaunchCount => "cudaLimitDevRuntimePendingLaunchCount",
50    Self::MaxL2FetchGranularity => "cudaLimitMaxL2FetchGranularity",
51    Self::PersistingL2CacheSize => "cudaLimitPersistingL2CacheSize",
52});
53
54/// CUDA device compute modes.
55#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, TryFromPrimitive, IntoPrimitive)]
56#[repr(u32)]
57#[non_exhaustive]
58pub enum ComputeMode {
59    /// Default compute mode; multiple threads can use [`Device::set_current`] with this device.
60    Default = runtime::cudaComputeMode::cudaComputeModeDefault as _,
61    /// Compute-exclusive-thread mode; only one thread in one process can use [`Device::set_current`] with this device.
62    Exclusive = runtime::cudaComputeMode::cudaComputeModeExclusive as _,
63    /// Compute-prohibited mode; no threads can use [`Device::set_current`] with this device.
64    Prohibited = runtime::cudaComputeMode::cudaComputeModeProhibited as _,
65    /// Compute-exclusive-process mode; many threads in one process can use [`Device::set_current`] with this device.
66    ExclusiveProcess = runtime::cudaComputeMode::cudaComputeModeExclusiveProcess as _,
67}
68
69impl_enum_conversion!(runtime::cudaComputeMode, ComputeMode);
70
71bitflags::bitflags! {
72    /// Flags for [`Device::enable_peer_access`].
73    #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
74    pub struct PeerAccessFlags: u32 {
75        /// Default peer-access behavior.
76        const DEFAULT = runtime::cudaPeerAccessDefault;
77    }
78}
79
80/// Attributes queryable between two devices using [`Device::p2p_attribute`].
81#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, TryFromPrimitive, IntoPrimitive)]
82#[repr(u32)]
83#[non_exhaustive]
84pub enum PeerToPeerAttribute {
85    PerformanceRank = runtime::cudaDeviceP2PAttr::CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK as _,
86    AccessSupported = runtime::cudaDeviceP2PAttr::CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED as _,
87    NativeAtomicSupported =
88        runtime::cudaDeviceP2PAttr::CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED as _,
89    #[deprecated]
90    CudaArrayAccessSupported =
91        runtime::cudaDeviceP2PAttr::CU_DEVICE_P2P_ATTRIBUTE_ACCESS_ACCESS_SUPPORTED as _,
92}
93
94impl_enum_conversion!(runtime::cudaDeviceP2PAttr, PeerToPeerAttribute);
95
96impl_enum_display!(PeerToPeerAttribute, {
97    Self::PerformanceRank => "cudaDevP2PAttrPerformanceRank",
98    Self::AccessSupported => "cudaDevP2PAttrAccessSupported",
99    Self::NativeAtomicSupported => "cudaDevP2PAttrNativeAtomicSupported",
100    Self::CudaArrayAccessSupported => "cudaDevP2PAttrCudaArrayAccessSupported",
101});
102
103#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
104pub struct StreamPriorityRange {
105    pub least: i32,
106    pub greatest: i32,
107}
108
109// TODO: use a crate
110#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
111pub struct Uuid {
112    pub bytes: [u8; 16],
113}
114
115impl From<driver::CUuuid> for Uuid {
116    fn from(value: driver::CUuuid) -> Self {
117        Self {
118            bytes: value.bytes.map(|byte| byte as u8),
119        }
120    }
121}
122
123impl From<Uuid> for driver::CUuuid {
124    fn from(value: Uuid) -> Self {
125        driver::CUuuid {
126            bytes: value.bytes.map(|byte| byte as _),
127        }
128    }
129}
130
131/// Rust representation of CUDA device properties.
132#[derive(Debug, Clone)]
133pub struct DeviceProperties {
134    /// ASCII string identifying device.
135    pub name: String,
136    /// 16-byte unique identifier.
137    pub uuid: Uuid,
138    /// 8-byte locally unique identifier. Value is undefined on TCC and non-Windows platforms.
139    pub luid: [u8; 8],
140    /// LUID device node mask. Value is undefined on TCC and non-Windows platforms.
141    pub luid_device_node_mask: u32,
142    /// Global memory available on device in bytes.
143    pub total_global_mem: usize,
144    /// Shared memory available per block in bytes.
145    pub shared_mem_per_block: usize,
146    /// 32-bit registers available per block.
147    pub regs_per_block: i32,
148    /// Warp size in threads.
149    pub warp_size: i32,
150    /// Maximum pitch in bytes allowed by memory copies.
151    pub mem_pitch: usize,
152    /// Maximum number of threads per block.
153    pub max_threads_per_block: i32,
154    /// Maximum size of each dimension of a block.
155    pub max_threads_dim: [i32; 3],
156    /// Maximum size of each dimension of a grid.
157    pub max_grid_size: [i32; 3],
158    /// Constant memory available on device in bytes.
159    pub total_const_mem: usize,
160    /// Major compute capability.
161    pub major: i32,
162    /// Minor compute capability.
163    pub minor: i32,
164    /// Alignment requirement for textures.
165    pub texture_alignment: usize,
166    /// Pitch alignment requirement for texture references bound to pitched memory.
167    pub texture_pitch_alignment: usize,
168    /// Number of multiprocessors on device.
169    pub multi_processor_count: i32,
170    /// Device is integrated as opposed to discrete.
171    pub integrated: bool,
172    /// Device can map host memory into CUDA address space.
173    pub can_map_host_memory: bool,
174    /// Maximum 1D texture size.
175    pub max_texture1d: i32,
176    /// Maximum 1D mipmapped texture size.
177    pub max_texture1d_mipmap: i32,
178    /// Maximum 2D texture dimensions.
179    pub max_texture2d: [i32; 2],
180    /// Maximum 2D mipmapped texture dimensions.
181    pub max_texture2d_mipmap: [i32; 2],
182    /// Maximum dimensions (width, height, pitch) for 2D textures bound to linear memory.
183    pub max_texture2d_linear: [i32; 3],
184    /// Maximum 2D texture dimensions for texture gather operations.
185    pub max_texture2d_gather: [i32; 2],
186    /// Maximum 3D texture dimensions.
187    pub max_texture3d: [i32; 3],
188    /// Maximum alternate 3D texture dimensions.
189    pub max_texture3d_alt: [i32; 3],
190    /// Maximum Cubemap texture dimensions.
191    pub max_texture_cubemap: i32,
192    /// Maximum 1D layered texture dimensions.
193    pub max_texture1d_layered: [i32; 2],
194    /// Maximum 2D layered texture dimensions.
195    pub max_texture2d_layered: [i32; 3],
196    /// Maximum Cubemap layered texture dimensions.
197    pub max_texture_cubemap_layered: [i32; 2],
198    /// Maximum 1D surface size.
199    pub max_surface1d: i32,
200    /// Maximum 2D surface dimensions.
201    pub max_surface2d: [i32; 2],
202    /// Maximum 3D surface dimensions.
203    pub max_surface3d: [i32; 3],
204    /// Maximum 1D layered surface dimensions.
205    pub max_surface1d_layered: [i32; 2],
206    /// Maximum 2D layered surface dimensions.
207    pub max_surface2d_layered: [i32; 3],
208    /// Maximum Cubemap surface dimensions.
209    pub max_surface_cubemap: i32,
210    /// Maximum Cubemap layered surface dimensions.
211    pub max_surface_cubemap_layered: [i32; 2],
212    /// Alignment requirements for surfaces.
213    pub surface_alignment: usize,
214    /// Device can possibly execute multiple kernels concurrently.
215    pub concurrent_kernels: bool,
216    /// Device has ECC support enabled.
217    pub ecc_enabled: bool,
218    /// PCI bus ID of the device.
219    pub pci_bus_id: i32,
220    /// PCI device ID of the device.
221    pub pci_device_id: i32,
222    /// PCI domain ID of the device.
223    pub pci_domain_id: i32,
224    /// 1 if device is a Tesla device using TCC driver, 0 otherwise.
225    pub tcc_driver: bool,
226    /// Number of asynchronous engines.
227    pub async_engine_count: i32,
228    /// Device shares a unified address space with the host.
229    pub unified_addressing: bool,
230    /// Global memory bus width in bits.
231    pub memory_bus_width: i32,
232    /// Size of L2 cache in bytes.
233    pub l2_cache_size: i32,
234    /// Device's maximum l2 persisting lines capacity setting in bytes.
235    pub persisting_l2_cache_max_size: i32,
236    /// Maximum resident threads per multiprocessor.
237    pub max_threads_per_multi_processor: i32,
238    /// Device supports stream priorities.
239    pub stream_priorities_supported: bool,
240    /// Device supports caching globals in L1.
241    pub global_l1_cache_supported: bool,
242    /// Device supports caching locals in L1.
243    pub local_l1_cache_supported: bool,
244    /// Shared memory available per multiprocessor in bytes.
245    pub shared_mem_per_multiprocessor: usize,
246    /// 32-bit registers available per multiprocessor.
247    pub regs_per_multiprocessor: i32,
248    /// Device supports allocating managed memory on this system.
249    pub managed_memory: bool,
250    /// Device is on a multi-GPU board.
251    pub is_multi_gpu_board: bool,
252    /// Unique identifier for a group of devices on the same multi-GPU board.
253    pub multi_gpu_board_group_id: i32,
254    /// Link between the device and the host supports native atomic operations.
255    pub host_native_atomic_supported: bool,
256    /// Device supports coherently accessing pageable memory without calling [`DeviceMemory::register_host`](crate::memory::DeviceMemory::register_host) on it.
257    pub pageable_memory_access: bool,
258    /// Device can coherently access managed memory concurrently with the CPU.
259    pub concurrent_managed_access: bool,
260    /// Device supports Compute Preemption.
261    pub compute_preemption_supported: bool,
262    /// Device can access host registered memory at the same virtual address as the CPU.
263    pub can_use_host_pointer_for_registered_mem: bool,
264    /// Device supports cooperative kernel launches.
265    pub cooperative_launch: bool,
266    /// Per device maximum shared memory per block usable by special opt-in.
267    pub shared_mem_per_block_optin: usize,
268    /// Device accesses pageable memory via the host's page tables.
269    pub pageable_memory_access_uses_host_page_tables: bool,
270    /// Host can directly access managed memory on the device without migration.
271    pub direct_managed_mem_access_from_host: bool,
272    /// Maximum number of resident blocks per multiprocessor.
273    pub max_blocks_per_multi_processor: i32,
274    /// Maximum value of the CUDA access-policy window `num_bytes` field.
275    pub access_policy_max_window_size: i32,
276    /// Shared memory reserved by CUDA driver per block in bytes.
277    pub reserved_shared_mem_per_block: usize,
278    /// Device supports host memory registration via [`DeviceMemory::register_host`](crate::memory::DeviceMemory::register_host).
279    pub host_register_supported: bool,
280    /// Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays.
281    pub sparse_cuda_array_supported: bool,
282    /// Device supports [`HostRegisterFlags::READ_ONLY`](crate::memory::HostRegisterFlags::READ_ONLY) for host registrations mapped as read-only to the GPU.
283    pub host_register_read_only_supported: bool,
284    /// External timeline semaphore interop is supported.
285    pub timeline_semaphore_interop_supported: bool,
286    /// Device supports CUDA memory pools.
287    pub memory_pools_supported: bool,
288    /// Device supports GPUDirect RDMA APIs.
289    pub gpu_direct_rdma_supported: bool,
290    /// The returned flags may be used as subset of the supported write ordering MASTs supplied with GPUDirect RDMA writes.
291    pub gpu_direct_rdma_flush_writes_options: u32,
292    /// GPUDirect RDMA writes are guaranteed to be ordered with respect to other GPUDirect RDMA writes from the same GPU.
293    pub gpu_direct_rdma_writes_ordering: i32,
294    /// Handle types supported with mempool based IPC.
295    pub memory_pool_supported_handle_types: u32,
296    /// Indicates device supports deferred mapping CUDA arrays and mapping hints.
297    pub deferred_mapping_cuda_array_supported: bool,
298    /// Device supports IPC Events.
299    pub ipc_event_supported: bool,
300    /// Device supports Cluster Launch.
301    pub cluster_launch: bool,
302    /// Device supports unified function pointers.
303    pub unified_function_pointers: bool,
304}
305
306impl TryFrom<runtime::cudaDeviceProp> for DeviceProperties {
307    type Error = Error;
308
309    fn try_from(value: runtime::cudaDeviceProp) -> Result<Self> {
310        let end = value
311            .name
312            .iter()
313            .position(|&c| c == 0)
314            .unwrap_or(value.name.len());
315        let name_bytes: Vec<u8> = value.name[..end].iter().map(|&byte| byte as u8).collect();
316        let name = String::from_utf8_lossy(&name_bytes).into_owned();
317
318        let prop = Self {
319            name,
320            uuid: value.uuid.into(),
321            luid: value.luid.map(|byte| byte as u8),
322            luid_device_node_mask: value.luidDeviceNodeMask,
323            total_global_mem: value.totalGlobalMem as usize,
324            shared_mem_per_block: value.sharedMemPerBlock as usize,
325            regs_per_block: value.regsPerBlock,
326            warp_size: value.warpSize,
327            mem_pitch: value.memPitch as usize,
328            max_threads_per_block: value.maxThreadsPerBlock,
329            max_threads_dim: value.maxThreadsDim,
330            max_grid_size: value.maxGridSize,
331            total_const_mem: value.totalConstMem as usize,
332            major: value.major,
333            minor: value.minor,
334            texture_alignment: value.textureAlignment as usize,
335            texture_pitch_alignment: value.texturePitchAlignment as usize,
336            multi_processor_count: value.multiProcessorCount,
337            integrated: value.integrated != 0,
338            can_map_host_memory: value.canMapHostMemory != 0,
339            max_texture1d: value.maxTexture1D,
340            max_texture1d_mipmap: value.maxTexture1DMipmap,
341            max_texture2d: value.maxTexture2D,
342            max_texture2d_mipmap: value.maxTexture2DMipmap,
343            max_texture2d_linear: value.maxTexture2DLinear,
344            max_texture2d_gather: value.maxTexture2DGather,
345            max_texture3d: value.maxTexture3D,
346            max_texture3d_alt: value.maxTexture3DAlt,
347            max_texture_cubemap: value.maxTextureCubemap,
348            max_texture1d_layered: value.maxTexture1DLayered,
349            max_texture2d_layered: value.maxTexture2DLayered,
350            max_texture_cubemap_layered: value.maxTextureCubemapLayered,
351            max_surface1d: value.maxSurface1D,
352            max_surface2d: value.maxSurface2D,
353            max_surface3d: value.maxSurface3D,
354            max_surface1d_layered: value.maxSurface1DLayered,
355            max_surface2d_layered: value.maxSurface2DLayered,
356            max_surface_cubemap: value.maxSurfaceCubemap,
357            max_surface_cubemap_layered: value.maxSurfaceCubemapLayered,
358            surface_alignment: value.surfaceAlignment as usize,
359            concurrent_kernels: value.concurrentKernels != 0,
360            ecc_enabled: value.ECCEnabled != 0,
361            pci_bus_id: value.pciBusID,
362            pci_device_id: value.pciDeviceID,
363            pci_domain_id: value.pciDomainID,
364            tcc_driver: value.tccDriver != 0,
365            async_engine_count: value.asyncEngineCount,
366            unified_addressing: value.unifiedAddressing != 0,
367            memory_bus_width: value.memoryBusWidth,
368            l2_cache_size: value.l2CacheSize,
369            persisting_l2_cache_max_size: value.persistingL2CacheMaxSize,
370            max_threads_per_multi_processor: value.maxThreadsPerMultiProcessor,
371            stream_priorities_supported: value.streamPrioritiesSupported != 0,
372            global_l1_cache_supported: value.globalL1CacheSupported != 0,
373            local_l1_cache_supported: value.localL1CacheSupported != 0,
374            shared_mem_per_multiprocessor: value.sharedMemPerMultiprocessor as usize,
375            regs_per_multiprocessor: value.regsPerMultiprocessor,
376            managed_memory: value.managedMemory != 0,
377            is_multi_gpu_board: value.isMultiGpuBoard != 0,
378            multi_gpu_board_group_id: value.multiGpuBoardGroupID,
379            host_native_atomic_supported: value.hostNativeAtomicSupported != 0,
380            pageable_memory_access: value.pageableMemoryAccess != 0,
381            concurrent_managed_access: value.concurrentManagedAccess != 0,
382            compute_preemption_supported: value.computePreemptionSupported != 0,
383            can_use_host_pointer_for_registered_mem: value.canUseHostPointerForRegisteredMem != 0,
384            cooperative_launch: value.cooperativeLaunch != 0,
385            shared_mem_per_block_optin: value.sharedMemPerBlockOptin as usize,
386            pageable_memory_access_uses_host_page_tables: value
387                .pageableMemoryAccessUsesHostPageTables
388                != 0,
389            direct_managed_mem_access_from_host: value.directManagedMemAccessFromHost != 0,
390            max_blocks_per_multi_processor: value.maxBlocksPerMultiProcessor,
391            access_policy_max_window_size: value.accessPolicyMaxWindowSize,
392            reserved_shared_mem_per_block: value.reservedSharedMemPerBlock as usize,
393            host_register_supported: value.hostRegisterSupported != 0,
394            sparse_cuda_array_supported: value.sparseCudaArraySupported != 0,
395            host_register_read_only_supported: value.hostRegisterReadOnlySupported != 0,
396            timeline_semaphore_interop_supported: value.timelineSemaphoreInteropSupported != 0,
397            memory_pools_supported: value.memoryPoolsSupported != 0,
398            gpu_direct_rdma_supported: value.gpuDirectRDMASupported != 0,
399            gpu_direct_rdma_flush_writes_options: value.gpuDirectRDMAFlushWritesOptions,
400            gpu_direct_rdma_writes_ordering: value.gpuDirectRDMAWritesOrdering,
401            memory_pool_supported_handle_types: value.memoryPoolSupportedHandleTypes,
402            deferred_mapping_cuda_array_supported: value.deferredMappingCudaArraySupported != 0,
403            ipc_event_supported: value.ipcEventSupported != 0,
404            cluster_launch: value.clusterLaunch != 0,
405            unified_function_pointers: value.unifiedFunctionPointers != 0,
406        };
407
408        Ok(prop)
409    }
410}
411
412#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
413pub struct Device(DeviceId);
414
415pub type DeviceId = i32;
416
417impl Device {
418    pub const fn new(id: DeviceId) -> Self {
419        Self(id)
420    }
421
422    /// Returns the number of devices with compute capability greater or equal to 2.0 that are available for execution.
423    ///
424    /// # Errors
425    ///
426    /// Returns an error if CUDA cannot query the device count, a previous
427    /// asynchronous launch reports an error, or CUDA reports runtime
428    /// initialization diagnostics such as [`crate::error::Status::NotInitialized`],
429    /// [`crate::error::Status::CallRequiresNewerDriver`], or [`crate::error::Status::NoDevice`].
430    pub fn count() -> Result<i32> {
431        let mut count: i32 = 0;
432        unsafe {
433            try_ffi!(runtime::cudaGetDeviceCount(&raw mut count))?;
434        }
435        Ok(count)
436    }
437
438    /// Returns the current device for the calling host thread.
439    ///
440    /// # Errors
441    ///
442    /// Returns an error if CUDA cannot query the current device, a previous
443    /// asynchronous launch reports an error, or CUDA reports runtime
444    /// initialization diagnostics.
445    pub fn current() -> Result<Self> {
446        let mut device_id: i32 = 0;
447        unsafe {
448            try_ffi!(runtime::cudaGetDevice(&raw mut device_id))?;
449        }
450        Ok(Self(device_id))
451    }
452
453    /// Blocks until the device has completed all preceding requested tasks.
454    /// [`Device::synchronize`] returns an error if one of the preceding tasks has failed.
455    /// If [`ContextFlags::SCHEDULE_BLOCKING_SYNC`] was set for this device, the
456    /// host thread blocks until the device has finished its work.
457    ///
458    /// * Use of [`Device::synchronize`] in device code was deprecated in CUDA 11.6 and removed for compute_90+ compilation.
459    ///   For compute capability &lt; 9.0, compile-time opt-in with `-D CUDA_FORCE_CDP1_IF_SUPPORTED` is required to continue using [`Device::synchronize`] in device code for now.
460    ///   This is different from host-side [`Device::synchronize`], which is still supported.
461    ///
462    /// # Errors
463    ///
464    /// Returns an error if synchronization fails, a previous asynchronous
465    /// launch reports an error, or CUDA reports runtime initialization
466    /// diagnostics.
467    pub fn synchronize() -> Result<()> {
468        unsafe {
469            try_ffi!(runtime::cudaDeviceSynchronize())?;
470        }
471        Ok(())
472    }
473
474    /// Explicitly destroys and cleans up all resources associated with the current device in the current process.
475    /// Accessing these resources or passing them to subsequent API calls after
476    /// reset results in undefined behavior.
477    /// These resources include streams, events, arrays, mipmapped arrays, pitched allocations, texture and surface objects, external memory and semaphore objects, and graphics resources owned by the current device state.
478    /// These resources also include memory allocations by [`DeviceMemory::alloc`](crate::memory::DeviceMemory::alloc), [`DeviceMemory::alloc_host`](crate::memory::DeviceMemory::alloc_host), [`DeviceMemory::alloc_managed`](crate::memory::DeviceMemory::alloc_managed) and [`sys::cudaMallocPitch`](singe_cuda_sys::runtime::cudaMallocPitch).
479    /// Any subsequent call to this device reinitializes it.
480    ///
481    /// This call resets the device immediately.
482    /// Ensure that no other host threads in the process are accessing the device when this is called.
483    ///
484    /// * [`Device::reset`] does not destroy memory allocated by [`DeviceMemory::alloc_async`](crate::memory::DeviceMemory::alloc_async) or [`sys::cudaMallocFromPoolAsync`](singe_cuda_sys::runtime::cudaMallocFromPoolAsync).
485    ///   These memory allocations must be destroyed explicitly.
486    /// * If a non-primary CUDA context is current to the thread, [`Device::reset`] will destroy only the internal CUDA runtime state for that context.
487    ///
488    /// # Errors
489    ///
490    /// Returns an error if device reset fails, a previous asynchronous launch
491    /// reports an error, or CUDA reports runtime initialization diagnostics.
492    pub fn reset() -> Result<()> {
493        unsafe {
494            try_ffi!(runtime::cudaDeviceReset())?;
495        }
496        Ok(())
497    }
498
499    /// Returns the current size of limit.
500    /// The following [`Limit`] values are supported.
501    ///
502    /// * [`Limit::StackSize`] is the stack size in bytes of each GPU thread.
503    /// * [`Limit::PrintfFifoSize`] is the size in bytes of the shared FIFO used by the `printf()` device system call.
504    /// * [`Limit::MallocHeapSize`] is the size in bytes of the heap used by the `malloc()` and `free()` device system calls.
505    /// * [`Limit::DevRuntimeSyncDepth`] is the maximum grid depth at which a thread can issue the device runtime call [`Device::synchronize`] to wait on child grid launches to complete.
506    ///   This feature is removed for devices of compute capability &gt;= 9.0, so such devices return [`crate::error::Status::UnsupportedLimit`].
507    /// * [`Limit::DevRuntimePendingLaunchCount`] is the maximum number of outstanding device runtime launches.
508    /// * [`Limit::MaxL2FetchGranularity`] is the L2 cache fetch granularity.
509    /// * [`Limit::PersistingL2CacheSize`] is the persisting L2 cache size in bytes.
510    ///
511    /// # Errors
512    ///
513    /// Returns an error if `limit` is unsupported, CUDA cannot query the limit,
514    /// a previous asynchronous launch reports an error, or CUDA reports runtime
515    /// initialization diagnostics.
516    pub fn limit(limit: Limit) -> Result<usize> {
517        let mut value = 0;
518        unsafe {
519            try_ffi!(runtime::cudaDeviceGetLimit(&raw mut value, limit.into(),))?;
520        }
521        Ok(value as _)
522    }
523
524    /// Setting limit to value is a request by the application to update the current limit maintained by the device.
525    /// The driver may modify the requested value to meet hardware requirements, such as clamping to minimum or maximum values or rounding up to the nearest element size.
526    /// Use [`Device::limit`] to query the effective value.
527    ///
528    /// Setting each [`Limit`] has its own specific restrictions, so each is discussed here.
529    ///
530    /// * [`Limit::StackSize`] controls the stack size in bytes of each GPU thread.
531    ///
532    /// * [`Limit::PrintfFifoSize`] controls the size in bytes of the shared FIFO used by the `printf()` device system call.
533    ///   Setting [`Limit::PrintfFifoSize`] must not be performed after launching any kernel that uses the `printf()` device system call; otherwise [`crate::error::Status::InvalidValue`] is returned.
534    ///
535    /// * [`Limit::MallocHeapSize`] controls the size in bytes of the heap used by the `malloc()` and `free()` device system calls.
536    ///   Setting [`Limit::MallocHeapSize`] must not be performed after launching any kernel that uses the `malloc()` or `free()` device system calls; otherwise [`crate::error::Status::InvalidValue`] is returned.
537    ///
538    /// * [`Limit::DevRuntimeSyncDepth`] controls the maximum nesting depth of a grid at which a thread can safely call [`Device::synchronize`].
539    ///   Setting this limit must be performed before any launch of a kernel that uses the device runtime and calls [`Device::synchronize`] above the default sync depth, two levels of grids.
540    ///   Calls to [`Device::synchronize`] fail if this limit is violated.
541    ///   This limit can be set smaller than the default or up to the maximum launch depth of 24.
542    ///   Additional sync-depth levels require the runtime to reserve large amounts of device memory that can no longer be used for application allocations.
543    ///   If these device-memory reservations fail, [`Device::set_limit`] returns an error, and the limit can be reset to a lower value.
544    ///   This limit is only applicable to devices of compute capability &lt; 9.0.
545    ///   Setting this limit on devices with other compute capabilities returns [`crate::error::Status::UnsupportedLimit`].
546    ///
547    /// * [`Limit::DevRuntimePendingLaunchCount`] controls the maximum number of outstanding device runtime launches that can be made from the current device.
548    ///   A grid is outstanding from launch until it is known to have completed.
549    ///   Device runtime launches that violate this limit fail.
550    ///   If a module using the device runtime needs more pending launches than the default 2048 launches, this limit can be increased.
551    ///   Sustaining additional pending launches requires the runtime to reserve larger amounts of device memory up front, which can no longer be used for allocations.
552    ///   If these reservations fail, [`Device::set_limit`] returns an error, and the limit can be reset to a lower value.
553    ///   This limit is only applicable to devices of compute capability 3.5 and higher.
554    ///   Setting this limit on devices with compute capability less than 3.5 returns [`crate::error::Status::UnsupportedLimit`].
555    ///
556    /// * [`Limit::MaxL2FetchGranularity`] controls the L2 cache fetch granularity.
557    ///   Values can range from 0B to 128B.
558    ///   Performance hint that can be ignored or clamped depending on the platform.
559    ///
560    /// * [`Limit::PersistingL2CacheSize`] controls size in bytes available for persisting L2 cache.
561    ///   Performance hint that can be ignored or clamped depending on the platform.
562    ///
563    /// # Errors
564    ///
565    /// Returns an error if `limit` is unsupported, `value` is invalid for that
566    /// limit, CUDA cannot set the limit, a previous asynchronous launch reports
567    /// an error, or CUDA reports runtime initialization diagnostics.
568    pub fn set_limit(limit: Limit, value: usize) -> Result<()> {
569        unsafe {
570            try_ffi!(runtime::cudaDeviceSetLimit(limit.into(), value as _))?;
571        }
572        Ok(())
573    }
574
575    /// Records flags as the flags for the current device.
576    /// If the current device has been set and that device has already been initialized, the previous flags are overwritten.
577    /// If the current device has not been initialized, it is initialized with the provided flags.
578    /// If no device has been made current to the calling thread, a default device is selected and initialized with the provided flags.
579    ///
580    /// The three least significant bits of `flags` control how the CPU thread interacts with the OS scheduler while waiting for device results.
581    ///
582    /// * [`ContextFlags::SCHEDULE_AUTO`]: The default value if `flags` is zero.
583    ///   Uses a heuristic based on the number of active CUDA contexts in the process (`C`) and the number of logical processors in the system (`P`).
584    ///   If `C > P`, CUDA yields to other OS threads when waiting for the device; otherwise, CUDA actively spins while waiting for results.
585    ///   Additionally, on Tegra devices, [`ContextFlags::SCHEDULE_AUTO`] uses a heuristic based on the power profile of the platform and may choose [`ContextFlags::SCHEDULE_BLOCKING_SYNC`] for low-powered devices.
586    /// * [`ContextFlags::SCHEDULE_SPIN`]: Instruct CUDA to actively spin when waiting for results from the device.
587    ///   This can decrease latency when waiting for the
588    ///   device, but may lower the performance of CPU threads if they are performing work in parallel with the CUDA thread.
589    /// * [`ContextFlags::SCHEDULE_YIELD`]: Instruct CUDA to yield its thread when waiting for results from the device.
590    ///   This can increase latency when waiting for the
591    ///   device, but can increase the performance of CPU threads performing work in parallel with the device.
592    /// * [`ContextFlags::SCHEDULE_BLOCKING_SYNC`]: Instruct CUDA to block the CPU thread on a synchronization primitive when waiting for the device to finish work.
593    ///
594    /// This matches the deprecated CUDA runtime blocking-sync behavior now represented by [`ContextFlags::SCHEDULE_BLOCKING_SYNC`].
595    /// * [`ContextFlags::MAP_HOST`]: This flag enables allocating pinned host memory that is accessible to the device.
596    ///   It is implicit for the runtime but may
597    ///   be absent if a context is created using the driver API.
598    ///   If this flag is not set, [`sys::cudaHostGetDevicePointer`](singe_cuda_sys::runtime::cudaHostGetDevicePointer) always returns a failure code.
599    /// * [`ContextFlags::LOCAL_MEMORY_RESIZE_TO_MAX`]: Instruct CUDA to not reduce local memory after resizing local memory for a kernel.
600    ///   This can prevent thrashing by local memory
601    ///   allocations when launching many kernels with high local memory usage at the cost of potentially increased memory usage.
602    ///
603    /// Deprecated: this behavior is now the default and cannot be disabled.
604    /// * [`ContextFlags::SYNC_MEMORY_OPERATIONS`]: Ensures that synchronous memory operations initiated on this context always synchronize.
605    ///   See further documentation
606    ///   in the section titled "API Synchronization behavior" to learn more about cases when synchronous memory operations can exhibit
607    ///   asynchronous behavior.
608    ///
609    /// # Errors
610    ///
611    /// Returns an error if CUDA cannot set the device flags, a previous
612    /// asynchronous launch reports an error, or CUDA reports runtime
613    /// initialization diagnostics.
614    pub fn set_flags(flags: ContextFlags) -> Result<()> {
615        unsafe { try_ffi!(runtime::cudaSetDeviceFlags(flags.bits())) }
616    }
617
618    /// Returns the flags for the current device.
619    /// If there is a current device for the calling thread, the flags for the device are returned.
620    /// If there is no current device, the flags for the first device are returned, which may be the default flags.
621    /// Compare to the behavior of [`Device::set_flags`].
622    ///
623    /// Typically, the returned flags match the behavior seen if the calling
624    /// thread uses a device after this call, assuming this thread or another
625    /// thread does not change the flags or current device in between.
626    /// If the device is not initialized, another thread can change the flags for the current device before it is initialized.
627    /// Additionally, when using exclusive mode, if this thread has not requested a specific device, it may use a device other than the first device, contrary to the assumption made by this query.
628    ///
629    /// If a context has been created via the driver API and is current to the calling thread, the flags for that context are always returned.
630    ///
631    /// Returned flags may specifically include [`ContextFlags::MAP_HOST`] even though it is not accepted by [`Device::set_flags`] because it is implicit in runtime API flags.
632    /// The reason for this is that the current context may have been created via the driver API in which case the flag is not implicit and may be unset.
633    ///
634    /// # Errors
635    ///
636    /// Returns an error if CUDA cannot query the device flags, a previous
637    /// asynchronous launch reports an error, or CUDA reports runtime
638    /// initialization diagnostics.
639    pub fn flags() -> Result<ContextFlags> {
640        let mut flags_raw: u32 = 0;
641        unsafe {
642            try_ffi!(runtime::cudaGetDeviceFlags(&raw mut flags_raw))?;
643        }
644        Ok(ContextFlags::from_bits_retain(flags_raw))
645    }
646
647    /// Returns the device which has properties that best match the given prop.
648    ///
649    /// # Errors
650    ///
651    /// Returns an error if CUDA cannot choose a matching device, the selected
652    /// device cannot be represented by this wrapper, a previous asynchronous
653    /// launch reports an error, or CUDA reports runtime initialization
654    /// diagnostics.
655    pub fn choose(prop: &DeviceProperties) -> Result<Self> {
656        // This function is tricky because cudaChooseDevice takes a *template* prop,
657        // and we have a fully filled DeviceProp. We need to construct a template
658        // cudaDeviceProp FFI struct based on the criteria we care about.
659        // Often, only major/minor compute capability is used.
660        // For simplicity here, let's assume we want an exact match on major/minor.
661        // A more robust implementation would allow specifying which fields matter.
662
663        let mut ffi_prop: runtime::cudaDeviceProp = unsafe { mem::zeroed() };
664        ffi_prop.major = prop.major;
665        ffi_prop.minor = prop.minor;
666        // Maybe add other critical fields like managedMemory support if needed?
667        ffi_prop.managedMemory = i32::from(prop.managed_memory);
668
669        let mut device: i32 = -1;
670        unsafe {
671            try_ffi!(runtime::cudaChooseDevice(
672                (&raw mut device).cast(),
673                &raw const ffi_prop
674            ))?;
675        }
676        if device == -1 {
677            Err(Error::DeviceNotFound)
678        } else {
679            Ok(Self(device))
680        }
681    }
682
683    /// Returns a device ordinal given a PCI bus ID string.
684    ///
685    /// # Errors
686    ///
687    /// Returns an error if `pci_bus_id` contains an interior NUL byte, CUDA
688    /// cannot resolve the bus ID, a previous asynchronous launch reports an
689    /// error, or CUDA reports runtime initialization diagnostics.
690    pub fn by_pci_bus_id(pci_bus_id: &str) -> Result<Self> {
691        let c_pci_bus_id = CString::new(pci_bus_id)?;
692        let mut device: i32 = -1;
693        unsafe {
694            try_ffi!(runtime::cudaDeviceGetByPCIBusId(
695                (&raw mut device).cast(),
696                c_pci_bus_id.as_ptr(),
697            ))?;
698        }
699        if device == -1 {
700            Err(Error::DeviceNotFound)
701        } else {
702            Ok(Self(device))
703        }
704    }
705
706    /// Sets device as the current device for the calling host thread.
707    /// Valid device id's are 0 to ([`Device::count`] - 1).
708    ///
709    /// Device memory subsequently allocated from this host thread is physically
710    /// resident on `device`.
711    /// Host memory allocated or registered from this host thread has its
712    /// lifetime associated with `device`.
713    /// Streams and events created from this host thread are associated with
714    /// `device`.
715    /// Kernels launched from this host thread execute on `device`.
716    ///
717    /// This may be called from any host thread, for any device, at any time.
718    /// This performs no synchronization with the previous or new device, and usually only takes significant time when it initializes the runtime's context state.
719    /// This binds the primary context of the specified device to the calling
720    /// thread; subsequent memory allocations, stream and event creations, and
721    /// kernel launches are associated with that primary context.
722    /// This also immediately initializes the runtime state on the primary context, and the context is current on the device immediately.
723    /// It returns an error if the device is in [`ComputeMode::ExclusiveProcess`] and is occupied by another process, or if it is in [`ComputeMode::Prohibited`].
724    ///
725    /// It is not required to call [`sys::cudaInitDevice`](singe_cuda_sys::runtime::cudaInitDevice) before using this method.
726    ///
727    /// # Errors
728    ///
729    /// Returns an error if CUDA cannot set the current device, the device is
730    /// unavailable due to compute mode restrictions, a previous asynchronous
731    /// launch reports an error, or CUDA reports runtime initialization
732    /// diagnostics.
733    pub fn set_current(self) -> Result<()> {
734        unsafe {
735            try_ffi!(runtime::cudaSetDevice(self.0))?;
736        }
737        Ok(())
738    }
739
740    /// Returns this device's properties.
741    ///
742    /// # Errors
743    ///
744    /// Returns an error if CUDA cannot query the device properties, the
745    /// returned properties cannot be converted into the safe wrapper type, a
746    /// previous asynchronous launch reports an error, or CUDA reports runtime
747    /// initialization diagnostics.
748    pub fn properties(self) -> Result<DeviceProperties> {
749        unsafe {
750            let mut prop = MaybeUninit::<runtime::cudaDeviceProp>::uninit();
751            try_ffi!(runtime::cudaGetDeviceProperties(prop.as_mut_ptr(), self.0))?;
752            prop.assume_init().try_into()
753        }
754    }
755
756    /// Returns the PCI bus ID string identifying the device.
757    ///
758    /// # Errors
759    ///
760    /// Returns an error if CUDA cannot query the PCI bus ID, a previous
761    /// asynchronous launch reports an error, or CUDA reports runtime
762    /// initialization diagnostics.
763    pub fn pci_bus_id(self) -> Result<String> {
764        const LEN: usize = 16; // Sufficient for typical PCI IDs like 0000:01:00.0
765        let mut pci_bus_id_buf = [0i8; LEN];
766        unsafe {
767            try_ffi!(runtime::cudaDeviceGetPCIBusId(
768                pci_bus_id_buf.as_mut_ptr().cast(),
769                LEN as _,
770                self.0,
771            ))?;
772            Ok(string_from_c_chars(&pci_bus_id_buf))
773        }
774    }
775
776    /// On success, all allocations from the peer device are immediately accessible by the current device.
777    /// They remain accessible until access is explicitly disabled using [`Device::disable_peer_access`] or either device is reset using [`Device::reset`].
778    ///
779    /// Access granted by this call is unidirectional; accessing memory on the current device from the peer device requires a separate symmetric call to [`Device::enable_peer_access`].
780    ///
781    /// There are both device-wide and system-wide limitations per system configuration, as noted in the CUDA Programming Guide under the section "Peer-to-Peer Memory Access".
782    ///
783    /// Returns [`crate::error::Status::InvalidDevice`] if [`Device::can_access_peer`] indicates that the current device cannot directly access memory from the peer device.
784    ///
785    /// Returns [`crate::error::Status::PeerAccessAlreadyEnabled`] if direct access to the peer device from the current device has already been enabled.
786    ///
787    /// Returns [`crate::error::Status::InvalidValue`] if flags is not 0.
788    ///
789    /// # Errors
790    ///
791    /// Returns an error if peer access cannot be enabled, `flags` is not
792    /// [`PeerAccessFlags::DEFAULT`], the peer is invalid or already enabled, a
793    /// previous asynchronous launch reports an error, or CUDA reports runtime
794    /// initialization diagnostics.
795    pub fn enable_peer_access(self, flags: PeerAccessFlags) -> Result<()> {
796        if flags != PeerAccessFlags::DEFAULT {
797            return Err(Error::InvalidValue);
798        }
799        unsafe { try_ffi!(runtime::cudaDeviceEnablePeerAccess(self.0, flags.bits(),)) }
800    }
801
802    /// Returns [`crate::error::Status::PeerAccessNotEnabled`] if direct access to memory on the peer device has not yet been enabled from the current device.
803    ///
804    /// # Errors
805    ///
806    /// Returns an error if peer access was not enabled, CUDA cannot disable
807    /// peer access, a previous asynchronous launch reports an error, or CUDA
808    /// reports runtime initialization diagnostics.
809    pub fn disable_peer_access(self) -> Result<()> {
810        unsafe { try_ffi!(runtime::cudaDeviceDisablePeerAccess(self.0)) }
811    }
812
813    /// Returns true if this device can directly access memory from `other`.
814    /// If direct access from this device to `other` is possible, access may be enabled by calling [`Device::enable_peer_access`].
815    ///
816    /// # Errors
817    ///
818    /// Returns an error if CUDA cannot query peer accessibility, either device
819    /// is invalid, a previous asynchronous launch reports an error, or CUDA
820    /// reports runtime initialization diagnostics.
821    pub fn can_access_peer(self, other: Self) -> Result<bool> {
822        let mut can_access_peer: i32 = 0;
823        unsafe {
824            try_ffi!(runtime::cudaDeviceCanAccessPeer(
825                (&raw mut can_access_peer).cast(),
826                self.0,
827                other.0,
828            ))?;
829        }
830        Ok(can_access_peer != 0)
831    }
832
833    /// Returns the value of the requested attribute of the link between devices.
834    /// Supported attributes are represented by [`PeerToPeerAttribute`]:
835    ///
836    /// * [`PeerToPeerAttribute::PerformanceRank`]: relative performance of the link between the two devices. Lower values are better.
837    /// * [`PeerToPeerAttribute::AccessSupported`]: whether peer access is enabled.
838    /// * [`PeerToPeerAttribute::NativeAtomicSupported`]: whether native atomic operations over the link are supported.
839    /// * [`PeerToPeerAttribute::CudaArrayAccessSupported`]: whether CUDA arrays are accessible over the link.
840    ///
841    /// Returns [`crate::error::Status::InvalidDevice`] if either device is invalid or if they represent the same device.
842    ///
843    /// Returns [`crate::error::Status::InvalidValue`] if `attrib` is not valid.
844    ///
845    /// # Errors
846    ///
847    /// Returns an error if either device is invalid, `attr` is not accepted by
848    /// CUDA, CUDA cannot query the attribute, a previous asynchronous launch
849    /// reports an error, or CUDA reports runtime initialization diagnostics.
850    pub fn p2p_attribute(self, attr: PeerToPeerAttribute, other: Self) -> Result<i32> {
851        let mut value: i32 = 0;
852        unsafe {
853            try_ffi!(runtime::cudaDeviceGetP2PAttribute(
854                (&raw mut value).cast(),
855                attr.into(),
856                self.0,
857                other.0,
858            ))?;
859        }
860        Ok(value)
861    }
862
863    /// On devices where the L1 cache and shared memory use the same hardware resources, this returns the preferred cache configuration for the current device.
864    /// This setting is only a preference.
865    /// The runtime uses the requested configuration if possible, but it may choose a different configuration if required to execute functions.
866    ///
867    /// This returns [`FunctionCache::PreferNone`] on devices where the size of the L1 cache and shared memory are fixed.
868    ///
869    /// The supported cache configurations are:
870    ///
871    /// * [`FunctionCache::PreferNone`]: no preference for shared memory or L1 (default)
872    /// * [`FunctionCache::PreferShared`]: prefer larger shared memory and smaller L1 cache
873    /// * [`FunctionCache::PreferL1`]: prefer larger L1 cache and smaller shared memory
874    /// * [`FunctionCache::PreferEqual`]: prefer equal size L1 cache and shared memory
875    ///
876    /// # Errors
877    ///
878    /// Returns an error if CUDA cannot query the cache configuration, a
879    /// previous asynchronous launch reports an error, or CUDA reports runtime
880    /// initialization diagnostics.
881    pub fn cache_config() -> Result<FunctionCache> {
882        let mut config = runtime::cudaFuncCache::CU_FUNC_CACHE_PREFER_NONE;
883        unsafe {
884            try_ffi!(runtime::cudaDeviceGetCacheConfig(&raw mut config))?;
885        }
886        Ok(config.into())
887    }
888
889    /// On devices where the L1 cache and shared memory use the same hardware resources, this sets through cacheConfig the preferred cache configuration for the current device.
890    /// This setting is only a preference.
891    /// The runtime uses the requested configuration if possible, but it is free to choose a different configuration if required to execute the kernel.
892    /// Any per-kernel cache preference set through the CUDA API takes precedence over this device-wide setting.
893    /// Setting the device-wide cache configuration to [`FunctionCache::PreferNone`] causes subsequent kernel launches to prefer not changing the cache configuration unless required to launch the kernel.
894    ///
895    /// This setting does nothing on devices where the size of the L1 cache and shared memory are fixed.
896    ///
897    /// Launching a kernel with a different preference than the most recent preference setting may insert a device-side synchronization point.
898    ///
899    /// The supported cache configurations are:
900    ///
901    /// * [`FunctionCache::PreferNone`]: no preference for shared memory or L1 (default)
902    /// * [`FunctionCache::PreferShared`]: prefer larger shared memory and smaller L1 cache
903    /// * [`FunctionCache::PreferL1`]: prefer larger L1 cache and smaller shared memory
904    /// * [`FunctionCache::PreferEqual`]: prefer equal size L1 cache and shared memory
905    ///
906    /// # Errors
907    ///
908    /// Returns an error if CUDA cannot set the cache configuration, a previous
909    /// asynchronous launch reports an error, or CUDA reports runtime
910    /// initialization diagnostics.
911    pub fn set_cache_config(config: FunctionCache) -> Result<()> {
912        unsafe {
913            try_ffi!(runtime::cudaDeviceSetCacheConfig(config.into()))?;
914        }
915        Ok(())
916    }
917
918    /// Returns the least and greatest stream priority numerical values.
919    /// Stream priorities follow a convention where lower numbers represent greater priorities.
920    /// The range of meaningful stream priorities is given by \[greatest, least\].
921    /// If a stream is created with a priority value outside this range, the priority is automatically clamped.
922    /// See [`Context::create_stream_with_priority`](crate::context::Context::create_stream_with_priority) for details on creating a priority stream.
923    ///
924    /// Returns 0 for both values if the current context's device does not support stream priorities.
925    ///
926    /// # Errors
927    ///
928    /// Returns an error if CUDA cannot query the stream-priority range, a
929    /// previous asynchronous launch reports an error, or CUDA reports runtime
930    /// initialization diagnostics.
931    pub fn stream_priority_range() -> Result<StreamPriorityRange> {
932        let mut least = 0;
933        let mut greatest = 0;
934        unsafe {
935            try_ffi!(runtime::cudaDeviceGetStreamPriorityRange(
936                &raw mut least,
937                &raw mut greatest,
938            ))?;
939        }
940        Ok(StreamPriorityRange { least, greatest })
941    }
942
943    pub const fn id(self) -> DeviceId {
944        self.0
945    }
946}
947
948#[cfg(all(test, feature = "testing"))]
949mod tests {
950    use super::*;
951
952    #[test]
953    fn it_works() {
954        match Device::count() {
955            Ok(count) => {
956                println!("Found {} CUDA devices.", count);
957                if count > 0 {
958                    match Device::new(0).properties() {
959                        Ok(props) => println!("Device 0: {}", props.name),
960                        Err(e) => eprintln!("error getting properties for device 0: {:?}", e),
961                    }
962                }
963            }
964            Err(e) => eprintln!("error getting device count: {:?}", e),
965        }
966    }
967}